244 files changed, 8251 insertions, 5757 deletions
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/removed/o2cb
index 9c49d8e..7f5daa4 100644
--- a/Documentation/ABI/obsolete/o2cb
+++ b/Documentation/ABI/removed/o2cb
@@ -1,11 +1,10 @@
 What:		/sys/o2cb symlink
-Date:		Dec 2005
-KernelVersion:	2.6.16
+Date:		May 2011
+KernelVersion:	2.6.40
 Contact:	ocfs2-devel@oss.oracle.com
-Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
-		be removed when new versions of ocfs2-tools which know to look
+Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
+		removed when new versions of ocfs2-tools which know to look
 		in /sys/fs/o2cb are sufficiently prevalent. Don't code new
 		software to look here, it should try /sys/fs/o2cb instead.
-		See Documentation/ABI/stable/o2cb for more information on usage.
 Users:		ocfs2-tools. It's sufficient to mail proposed changes to
 		ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
new file mode 100644
index 0000000..662ae64
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
@@ -0,0 +1,11 @@
+What:		/sys/kernel/mm/cleancache/
+Date:		April 2011
+Contact:	Dan Magenheimer <dan.magenheimer@oracle.com>
+Description:
+		/sys/kernel/mm/cleancache/ contains a number of files which
+		record a count of various cleancache operations
+		(sum across all filesystems):
+			succ_gets
+			failed_gets
+			puts
+			flushes
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 95788ad..ff31b1c 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -262,16 +262,6 @@ Who:	Michael Buesch <mb@bu3sch.de>
 
 ---------------------------
 
-What:	/sys/o2cb symlink
-When:	January 2010
-Why:	/sys/fs/o2cb is the proper location for this information - /sys/o2cb
-	exists as a symlink for backwards compatibility for old versions of
-	ocfs2-tools. 2 years should be sufficient time to phase in new versions
-	which know to look in /sys/fs/o2cb.
-Who:	ocfs2-devel@oss.oracle.com
-
----------------------------
-
 What:	Ability for non root users to shm_get hugetlb pages based on mlock
 	resource limits
 When:	2.6.31
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index c79ec58..3ae9bc9 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -226,10 +226,6 @@ acl			Enables POSIX Access Control Lists support.
 noacl			This option disables POSIX Access Control List
 			support.
 
-reservation
-
-noreservation
-
 bsddf		(*)	Make 'df' act like BSD.
 minixdf			Make 'df' act like Minix.
 
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 9ed920a..7618a28 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -46,9 +46,15 @@ errors=panic		Panic and halt the machine if an error occurs.
 intr		(*)	Allow signals to interrupt cluster operations.
 nointr			Do not allow signals to interrupt cluster
 			operations.
+noatime			Do not update access time.
+relatime(*)		Update atime if the previous atime is older than
+			mtime or ctime
+strictatime		Always update atime, but the minimum update interval
+			is specified by atime_quantum.
 atime_quantum=60(*)	OCFS2 will not update atime unless this number
 			of seconds has passed since the last update.
-			Set to zero to always update atime.
+			Set to zero to always update atime. This option need
+			work with strictatime.
 data=ordered	(*)	All data are forced directly out to the main file
 			system prior to its metadata being committed to the
 			journal.
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 7bff3e4..3fc0c31 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -39,6 +39,12 @@ When mounting an XFS filesystem, the following options are accepted.
 	drive level write caching to be enabled, for devices that
 	support write barriers.
 
+  discard
+	Issue command to let the block device reclaim space freed by the
+	filesystem.  This is useful for SSD devices, thinly provisioned
+	LUNs and virtual machine images, but may have a performance
+	impact.  This option is incompatible with the nodelaylog option.
+
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
 	Use with the "mtpt" option.
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt
new file mode 100644
index 0000000..36c367c
--- /dev/null
+++ b/Documentation/vm/cleancache.txt
@@ -0,0 +1,278 @@
+MOTIVATION
+
+Cleancache is a new optional feature provided by the VFS layer that
+potentially dramatically increases page cache effectiveness for
+many workloads in many environments at a negligible cost.
+
+Cleancache can be thought of as a page-granularity victim cache for clean
+pages that the kernel's pageframe replacement algorithm (PFRA) would like
+to keep around, but can't since there isn't enough memory.  So when the
+PFRA "evicts" a page, it first attempts to use cleancache code to
+put the data contained in that page into "transcendent memory", memory
+that is not directly accessible or addressable by the kernel and is
+of unknown and possibly time-varying size.
+
+Later, when a cleancache-enabled filesystem wishes to access a page
+in a file on disk, it first checks cleancache to see if it already
+contains it; if it does, the page of data is copied into the kernel
+and a disk access is avoided.
+
+Transcendent memory "drivers" for cleancache are currently implemented
+in Xen (using hypervisor memory) and zcache (using in-kernel compressed
+memory) and other implementations are in development.
+
+FAQs are included below.
+
+IMPLEMENTATION OVERVIEW
+
+A cleancache "backend" that provides transcendent memory registers itself
+to the kernel's cleancache "frontend" by calling cleancache_register_ops,
+passing a pointer to a cleancache_ops structure with funcs set appropriately.
+Note that cleancache_register_ops returns the previous settings so that
+chaining can be performed if desired. The functions provided must conform to
+certain semantics as follows:
+
+Most important, cleancache is "ephemeral".  Pages which are copied into
+cleancache have an indefinite lifetime which is completely unknowable
+by the kernel and so may or may not still be in cleancache at any later time.
+Thus, as its name implies, cleancache is not suitable for dirty pages.
+Cleancache has complete discretion over what pages to preserve and what
+pages to discard and when.
+
+Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a
+pool id which, if positive, must be saved in the filesystem's superblock;
+a negative return value indicates failure.  A "put_page" will copy a
+(presumably about-to-be-evicted) page into cleancache and associate it with
+the pool id, a file key, and a page index into the file.  (The combination
+of a pool id, a file key, and an index is sometimes called a "handle".)
+A "get_page" will copy the page, if found, from cleancache into kernel memory.
+A "flush_page" will ensure the page no longer is present in cleancache;
+a "flush_inode" will flush all pages associated with the specified file;
+and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
+all files specified by the given pool id and also surrender the pool id.
+
+An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
+to treat the pool as shared using a 128-bit UUID as a key.  On systems
+that may run multiple kernels (such as hard partitioned or virtualized
+systems) that may share a clustered filesystem, and where cleancache
+may be shared among those kernels, calls to init_shared_fs that specify the
+same UUID will receive the same pool id, thus allowing the pages to
+be shared.  Note that any security requirements must be imposed outside
+of the kernel (e.g. by "tools" that control cleancache).  Or a
+cleancache implementation can simply disable shared_init by always
+returning a negative value.
+
+If a get_page is successful on a non-shared pool, the page is flushed (thus
+making cleancache an "exclusive" cache).  On a shared pool, the page
+is NOT flushed on a successful get_page so that it remains accessible to
+other sharers.  The kernel is responsible for ensuring coherency between
+cleancache (shared or not), the page cache, and the filesystem, using
+cleancache flush operations as required.
+
+Note that cleancache must enforce put-put-get coherency and get-get
+coherency.  For the former, if two puts are made to the same handle but
+with different data, say AAA by the first put and BBB by the second, a
+subsequent get can never return the stale data (AAA).  For get-get coherency,
+if a get for a given handle fails, subsequent gets for that handle will
+never succeed unless preceded by a successful put with that handle.
+
+Last, cleancache provides no SMP serialization guarantees; if two
+different Linux threads are simultaneously putting and flushing a page
+with the same handle, the results are indeterminate.  Callers must
+lock the page to ensure serial behavior.
+
+CLEANCACHE PERFORMANCE METRICS
+
+Cleancache monitoring is done by sysfs files in the
+/sys/kernel/mm/cleancache directory.  The effectiveness of cleancache
+can be measured (across all filesystems) with:
+
+succ_gets	- number of gets that were successful
+failed_gets	- number of gets that failed
+puts		- number of puts attempted (all "succeed")
+flushes		- number of flushes attempted
+
+A backend implementatation may provide additional metrics.
+
+FAQ
+
+1) Where's the value? (Andrew Morton)
+
+Cleancache provides a significant performance benefit to many workloads
+in many environments with negligible overhead by improving the
+effectiveness of the pagecache.  Clean pagecache pages are
+saved in transcendent memory (RAM that is otherwise not directly
+addressable to the kernel); fetching those pages later avoids "refaults"
+and thus disk reads.
+
+Cleancache (and its sister code "frontswap") provide interfaces for
+this transcendent memory (aka "tmem"), which conceptually lies between
+fast kernel-directly-addressable RAM and slower DMA/asynchronous devices.
+Disallowing direct kernel or userland reads/writes to tmem
+is ideal when data is transformed to a different form and size (such
+as with compression) or secretly moved (as might be useful for write-
+balancing for some RAM-like devices).  Evicted page-cache pages (and
+swap pages) are a great use for this kind of slower-than-RAM-but-much-
+faster-than-disk transcendent memory, and the cleancache (and frontswap)
+"page-object-oriented" specification provides a nice way to read and
+write -- and indirectly "name" -- the pages.
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to
+do it well with no kernel change have essentially failed (except in some
+well-publicized special-case workloads).  Cleancache -- and frontswap --
+with a fairly small impact on the kernel, provide a huge amount
+of flexibility for more dynamic, flexible RAM multiplexing.
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "self-ballooning"), page cache pages
+are the first to go, and cleancache allows those pages to be
+saved and reclaimed if overall host system memory conditions allow.
+
+And the identical interface used for cleancache can be used in
+physical systems as well.  The zcache driver acts as a memory-hungry
+device that stores pages of data in a compressed state.  And
+the proposed "RAMster" driver shares RAM across multiple physical
+systems.
+
+2) Why does cleancache have its sticky fingers so deep inside the
+   filesystems and VFS? (Andrew Morton and Christoph Hellwig)
+
+The core hooks for cleancache in VFS are in most cases a single line
+and the minimum set are placed precisely where needed to maintain
+coherency (via cleancache_flush operations) between cleancache,
+the page cache, and disk.  All hooks compile into nothingness if
+cleancache is config'ed off and turn into a function-pointer-
+compare-to-NULL if config'ed on but no backend claims the ops
+functions, or to a compare-struct-element-to-negative if a
+backend claims the ops functions but a filesystem doesn't enable
+cleancache.
+
+Some filesystems are built entirely on top of VFS and the hooks
+in VFS are sufficient, so don't require an "init_fs" hook; the
+initial implementation of cleancache didn't provide this hook.
+But for some filesystems (such as btrfs), the VFS hooks are
+incomplete and one or more hooks in fs-specific code are required.
+And for some other filesystems, such as tmpfs, cleancache may
+be counterproductive.  So it seemed prudent to require a filesystem
+to "opt in" to use cleancache, which requires adding a hook in
+each filesystem.  Not all filesystems are supported by cleancache
+only because they haven't been tested.  The existing set should
+be sufficient to validate the concept, the opt-in approach means
+that untested filesystems are not affected, and the hooks in the
+existing filesystems should make it very easy to add more
+filesystems in the future.
+
+The total impact of the hooks to existing fs and mm files is only
+about 40 lines added (not counting comments and blank lines).
+
+3) Why not make cleancache asynchronous and batched so it can
+   more easily interface with real devices with DMA instead
+   of copying each individual page? (Minchan Kim)
+
+The one-page-at-a-time copy semantics simplifies the implementation
+on both the frontend and backend and also allows the backend to
+do fancy things on-the-fly like page compression and
+page deduplication.  And since the data is "gone" (copied into/out
+of the pageframe) before the cleancache get/put call returns,
+a great deal of race conditions and potential coherency issues
+are avoided.  While the interface seems odd for a "real device"
+or for real kernel-addressable RAM, it makes perfect sense for
+transcendent memory.
+
+4) Why is non-shared cleancache "exclusive"?  And where is the
+   page "flushed" after a "get"? (Minchan Kim)
+
+The main reason is to free up space in transcendent memory and
+to avoid unnecessary cleancache_flush calls.  If you want inclusive,
+the page can be "put" immediately following the "get".  If
+put-after-get for inclusive becomes common, the interface could
+be easily extended to add a "get_no_flush" call.
+
+The flush is done by the cleancache backend implementation.
+
+5) What's the performance impact?
+
+Performance analysis has been presented at OLS'09 and LCA'10.
+Briefly, performance gains can be significant on most workloads,
+especially when memory pressure is high (e.g. when RAM is
+overcommitted in a virtual workload); and because the hooks are
+invoked primarily in place of or in addition to a disk read/write,
+overhead is negligible even in worst case workloads.  Basically
+cleancache replaces I/O with memory-copy-CPU-overhead; on older
+single-core systems with slow memory-copy speeds, cleancache
+has little value, but in newer multicore machines, especially
+consolidated/virtualized machines, it has great value.
+
+6) How do I add cleancache support for filesystem X? (Boaz Harrash)
+
+Filesystems that are well-behaved and conform to certain
+restrictions can utilize cleancache simply by making a call to
+cleancache_init_fs at mount time.  Unusual, misbehaving, or
+poorly layered filesystems must either add additional hooks
+and/or undergo extensive additional testing... or should just
+not enable the optional cleancache.
+
+Some points for a filesystem to consider:
+
+- The FS should be block-device-based (e.g. a ram-based FS such
+  as tmpfs should not enable cleancache)
+- To ensure coherency/correctness, the FS must ensure that all
+  file removal or truncation operations either go through VFS or
+  add hooks to do the equivalent cleancache "flush" operations
+- To ensure coherency/correctness, either inode numbers must
+  be unique across the lifetime of the on-disk file OR the
+  FS must provide an "encode_fh" function.
+- The FS must call the VFS superblock alloc and deactivate routines
+  or add hooks to do the equivalent cleancache calls done there.
+- To maximize performance, all pages fetched from the FS should
+  go through the do_mpag_readpage routine or the FS should add
+  hooks to do the equivalent (cf. btrfs)
+- Currently, the FS blocksize must be the same as PAGESIZE.  This
+  is not an architectural restriction, but no backends currently
+  support anything different.
+- A clustered FS should invoke the "shared_init_fs" cleancache
+  hook to get best performance for some backends.
+
+7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
+
+If cleancache would use the inode virtual address instead of
+inode/filehandle, the pool id could be eliminated.  But, this
+won't work because cleancache retains pagecache data pages
+persistently even when the inode has been pruned from the
+inode unused list, and only flushes the data page if the file
+gets removed/truncated.  So if cleancache used the inode kva,
+there would be potential coherency issues if/when the inode
+kva is reused for a different file.  Alternately, if cleancache
+flushed the pages when the inode kva was freed, much of the value
+of cleancache would be lost because the cache of pages in cleanache
+is potentially much larger than the kernel pagecache and is most
+useful if the pages survive inode cache removal.
+
+8) Why is a global variable required?
+
+The cleancache_enabled flag is checked in all of the frequently-used
+cleancache hooks.  The alternative is a function call to check a static
+variable. Since cleancache is enabled dynamically at runtime, systems
+that don't enable cleancache would suffer thousands (possibly
+tens-of-thousands) of unnecessary function calls per second.  So the
+global variable allows cleancache to be enabled by default at compile
+time, but have insignificant performance impact when cleancache remains
+disabled at runtime.
+
+9) Does cleanache work with KVM?
+
+The memory model of KVM is sufficiently different that a cleancache
+backend may have less value for KVM.  This remains to be tested,
+especially in an overcommitted system.
+
+10) Does cleancache work in userspace?  It sounds useful for
+   memory hungry caches like web browsers.  (Jamie Lokier)
+
+No plans yet, though we agree it sounds useful, at least for
+apps that bypass the page cache (e.g. O_DIRECT).
+
+Last updated: Dan Magenheimer, April 13 2011
diff --git a/MAINTAINERS b/MAINTAINERS
index b75366b..21a871c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3574,9 +3574,16 @@ M:	Andrew Morton <akpm@linux-foundation.org>
 M:	Jan Kara <jack@suse.cz>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd*/
-F:	include/linux/ext*jbd*.h
-F:	include/linux/jbd*.h
+F:	fs/jbd/
+F:	include/linux/ext3_jbd.h
+F:	include/linux/jbd.h
+
+JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
+M:	"Theodore Ts'o" <tytso@mit.edu>
+L:	linux-ext4@vger.kernel.org
+S:	Maintained
+F:	fs/jbd2/
+F:	include/linux/jbd2.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao <leitao@linux.vnet.ibm.com>
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 076db52..d5f00d7 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -21,58 +21,22 @@ CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_ARCH_OMAP=y
-CONFIG_ARCH_OMAP2=y
-CONFIG_ARCH_OMAP3=y
-CONFIG_ARCH_OMAP4=y
 CONFIG_OMAP_RESET_CLOCKS=y
 CONFIG_OMAP_MUX_DEBUG=y
-CONFIG_OMAP_32K_TIMER=y
-CONFIG_MACH_OMAP_GENERIC=y
-CONFIG_ARCH_OMAP2420=y
-CONFIG_ARCH_OMAP2430=y
-CONFIG_ARCH_OMAP3430=y
-CONFIG_MACH_OMAP_H4=y
-CONFIG_MACH_OMAP_APOLLON=y
-CONFIG_MACH_OMAP_2430SDP=y
-CONFIG_MACH_OMAP3_BEAGLE=y
-CONFIG_MACH_DEVKIT8000=y
-CONFIG_MACH_OMAP_LDP=y
-CONFIG_MACH_OVERO=y
-CONFIG_MACH_OMAP3EVM=y
-CONFIG_MACH_OMAP3517EVM=y
-CONFIG_MACH_OMAP3_PANDORA=y
-CONFIG_MACH_OMAP3_TOUCHBOOK=y
-CONFIG_MACH_OMAP_3430SDP=y
-CONFIG_MACH_NOKIA_N8X0=y
-CONFIG_MACH_NOKIA_RX51=y
-CONFIG_MACH_OMAP_ZOOM2=y
-CONFIG_MACH_OMAP_ZOOM3=y
-CONFIG_MACH_CM_T35=y
-CONFIG_MACH_IGEP0020=y
-CONFIG_MACH_SBC3530=y
-CONFIG_MACH_OMAP_3630SDP=y
-CONFIG_MACH_OMAP_4430SDP=y
 CONFIG_ARM_THUMBEE=y
-CONFIG_ARM_L1_CACHE_SHIFT=5
 CONFIG_ARM_ERRATA_411920=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=2
-# CONFIG_LOCAL_TIMERS is not set
-CONFIG_AEABI=y
 CONFIG_LEDS=y
 CONFIG_ZBOOT_ROM_TEXT=0x0
 CONFIG_ZBOOT_ROM_BSS=0x0
 CONFIG_CMDLINE="root=/dev/mmcblk0p2 rootwait console=ttyO2,115200"
 CONFIG_KEXEC=y
 CONFIG_FPE_NWFPE=y
-CONFIG_VFP=y
-CONFIG_NEON=y
 CONFIG_BINFMT_MISC=y
-CONFIG_PM=y
 CONFIG_PM_DEBUG=y
-CONFIG_PM_RUNTIME=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -89,14 +53,6 @@ CONFIG_IP_PNP_RARP=y
 # CONFIG_IPV6 is not set
 CONFIG_NETFILTER=y
 CONFIG_BT=m
-CONFIG_BT_L2CAP=m
-CONFIG_BT_SCO=m
-CONFIG_BT_RFCOMM=y
-CONFIG_BT_RFCOMM_TTY=y
-CONFIG_BT_BNEP=m
-CONFIG_BT_BNEP_MC_FILTER=y
-CONFIG_BT_BNEP_PROTO_FILTER=y
-CONFIG_BT_HIDP=m
 CONFIG_BT_HCIUART=m
 CONFIG_BT_HCIUART_H4=y
 CONFIG_BT_HCIUART_BCSP=y
@@ -107,11 +63,9 @@ CONFIG_CFG80211=m
 CONFIG_MAC80211=m
 CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_RC_DEFAULT_PID=y
-CONFIG_MAC80211_LEDS=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CONNECTOR=y
 CONFIG_MTD=y
-CONFIG_MTD_CONCAT=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
@@ -127,7 +81,6 @@ CONFIG_MTD_UBI=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=16384
-CONFIG_EEPROM_LEGACY=y
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_SCSI_MULTI_LUN=y
@@ -158,19 +111,15 @@ CONFIG_TOUCHSCREEN_ADS7846=y
 CONFIG_INPUT_MISC=y
 CONFIG_INPUT_TWL4030_PWRBUTTON=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
+# CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_8250_NR_UARTS=32
 CONFIG_SERIAL_8250_EXTENDED=y
 CONFIG_SERIAL_8250_MANY_PORTS=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
 CONFIG_SERIAL_8250_DETECT_IRQ=y
 CONFIG_SERIAL_8250_RSA=y
-# CONFIG_LEGACY_PTYS is not set
 CONFIG_HW_RANDOM=y
-CONFIG_I2C=y
 CONFIG_I2C_CHARDEV=y
-CONFIG_I2C_OMAP=y
 CONFIG_SPI=y
 CONFIG_SPI_OMAP24XX=y
 CONFIG_DEBUG_GPIO=y
@@ -181,10 +130,6 @@ CONFIG_POWER_SUPPLY=y
 CONFIG_WATCHDOG=y
 CONFIG_OMAP_WATCHDOG=y
 CONFIG_TWL4030_WATCHDOG=y
-CONFIG_MENELAUS=y
-CONFIG_TWL4030_CORE=y
-CONFIG_TWL4030_POWER=y
-CONFIG_REGULATOR=y
 CONFIG_REGULATOR_TWL4030=y
 CONFIG_REGULATOR_TPS65023=y
 CONFIG_REGULATOR_TPS6507X=y
@@ -208,7 +153,6 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
 CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_PLATFORM=y
 CONFIG_DISPLAY_SUPPORT=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
 CONFIG_FONTS=y
@@ -217,25 +161,20 @@ CONFIG_FONT_8x16=y
 CONFIG_LOGO=y
 CONFIG_SOUND=m
 CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
 CONFIG_SND_VERBOSE_PRINTK=y
 CONFIG_SND_DEBUG=y
-CONFIG_SND_USB_AUDIO=y
-CONFIG_SND_SOC=y
-CONFIG_SND_OMAP_SOC=y
-CONFIG_SND_OMAP_SOC_OMAP3_PANDORA=y
+CONFIG_SND_USB_AUDIO=m
+CONFIG_SND_SOC=m
+CONFIG_SND_OMAP_SOC=m
+CONFIG_SND_OMAP_SOC_OMAP3_PANDORA=m
 CONFIG_USB=y
 CONFIG_USB_DEBUG=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_DEVICEFS=y
 CONFIG_USB_SUSPEND=y
-# CONFIG_USB_OTG_WHITELIST is not set
 CONFIG_USB_MON=y
-# CONFIG_USB_MUSB_HDRC is not set
-# CONFIG_USB_MUSB_OTG is not set
-# CONFIG_USB_GADGET_MUSB_HDRC is not set
-CONFIG_USB_MUSB_DEBUG=y
 CONFIG_USB_WDM=y
 CONFIG_USB_STORAGE=y
 CONFIG_USB_LIBUSUAL=y
@@ -250,18 +189,12 @@ CONFIG_MMC_UNSAFE_RESUME=y
 CONFIG_SDIO_UART=y
 CONFIG_MMC_OMAP=y
 CONFIG_MMC_OMAP_HS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_TWL92330=y
 CONFIG_RTC_DRV_TWL4030=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_FS_XATTR is not set
-CONFIG_INOTIFY=y
 CONFIG_QUOTA=y
 CONFIG_QFMT_V2=y
 CONFIG_MSDOS_FS=y
@@ -285,12 +218,10 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_PRINTK_TIME=y
 CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
 CONFIG_PROVE_LOCKING=y
-# CONFIG_LOCK_STAT is not set
 CONFIG_DEBUG_SPINLOCK_SLEEP=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
 CONFIG_DEBUG_INFO=y
diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig
index b997a358..19d5891 100644
--- a/arch/arm/mach-omap2/Kconfig
+++ b/arch/arm/mach-omap2/Kconfig
@@ -288,6 +288,7 @@ config MACH_IGEP0030
 	depends on ARCH_OMAP3
 	default y
 	select OMAP_PACKAGE_CBB
+	select MACH_IGEP0020
 
 config MACH_SBC3530
 	bool "OMAP3 SBC STALKER board"
diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 66dfbcc..b148077 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -229,8 +229,6 @@ obj-$(CONFIG_MACH_CM_T35)		+= board-cm-t35.o \
 obj-$(CONFIG_MACH_CM_T3517)		+= board-cm-t3517.o
 obj-$(CONFIG_MACH_IGEP0020)		+= board-igep0020.o \
 					   hsmmc.o
-obj-$(CONFIG_MACH_IGEP0030)		+= board-igep0030.o \
-					   hsmmc.o
 obj-$(CONFIG_MACH_OMAP3_TOUCHBOOK)	+= board-omap3touchbook.o \
 					   hsmmc.o
 obj-$(CONFIG_MACH_OMAP_4430SDP)		+= board-4430sdp.o \
@@ -270,3 +268,5 @@ obj-$(CONFIG_ARCH_OMAP4)		+= hwspinlock.o
 
 disp-$(CONFIG_OMAP2_DSS)		:= display.o
 obj-y					+= $(disp-m) $(disp-y)
+
+obj-y					+= common-board-devices.o
diff --git a/arch/arm/mach-omap2/board-2430sdp.c b/arch/arm/mach-omap2/board-2430sdp.c
index 1fa6bb8..d54969b 100644
--- a/arch/arm/mach-omap2/board-2430sdp.c
+++ b/arch/arm/mach-omap2/board-2430sdp.c
@@ -41,6 +41,7 @@
 
 #include "mux.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define SDP2430_CS0_BASE	0x04000000
 #define SECONDARY_LCD_GPIO		147
@@ -180,15 +181,6 @@ static struct twl4030_platform_data sdp2430_twldata = {
 	.vmmc1		= &sdp2430_vmmc1,
 };
 
-static struct i2c_board_info __initdata sdp2430_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_24XX_SYS_NIRQ,
-		.platform_data = &sdp2430_twldata,
-	},
-};
-
 static struct i2c_board_info __initdata sdp2430_i2c1_boardinfo[] = {
 	{
 		I2C_BOARD_INFO("isp1301_omap", 0x2D),
@@ -201,8 +193,7 @@ static int __init omap2430_i2c_init(void)
 {
 	omap_register_i2c_bus(1, 100, sdp2430_i2c1_boardinfo,
 			ARRAY_SIZE(sdp2430_i2c1_boardinfo));
-	omap_register_i2c_bus(2, 2600, sdp2430_i2c_boardinfo,
-			ARRAY_SIZE(sdp2430_i2c_boardinfo));
+	omap2_pmic_init("twl4030", &sdp2430_twldata);
 	return 0;
 }
 
@@ -217,11 +208,6 @@ static struct omap2_hsmmc_info mmc[] __initdata = {
 	{}	/* Terminator */
 };
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
 static struct omap_usb_config sdp2430_usb_config __initdata = {
 	.otg		= 1,
 #ifdef  CONFIG_USB_GADGET_OMAP
@@ -240,8 +226,6 @@ static struct omap_board_mux board_mux[] __initdata = {
 
 static void __init omap_2430sdp_init(void)
 {
-	int ret;
-
 	omap2430_mux_init(board_mux, OMAP_PACKAGE_ZAC);
 
 	omap_board_config = sdp2430_config;
@@ -255,14 +239,13 @@ static void __init omap_2430sdp_init(void)
 	omap2_usbfs_init(&sdp2430_usb_config);
 
 	omap_mux_init_signal("usb0hs_stp", OMAP_PULL_ENA | OMAP_PULL_UP);
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 
 	board_smc91x_init();
 
 	/* Turn off secondary LCD backlight */
-	ret = gpio_request(SECONDARY_LCD_GPIO, "Secondary LCD backlight");
-	if (ret == 0)
-		gpio_direction_output(SECONDARY_LCD_GPIO, 0);
+	gpio_request_one(SECONDARY_LCD_GPIO, GPIOF_OUT_INIT_LOW,
+			 "Secondary LCD backlight");
 }
 
 static void __init omap_2430sdp_map_io(void)
diff --git a/arch/arm/mach-omap2/board-3430sdp.c b/arch/arm/mach-omap2/board-3430sdp.c
index 23244cd..ae2963a 100644
--- a/arch/arm/mach-omap2/board-3430sdp.c
+++ b/arch/arm/mach-omap2/board-3430sdp.c
@@ -19,7 +19,6 @@
 #include <linux/input.h>
 #include <linux/input/matrix_keypad.h>
 #include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
 #include <linux/i2c/twl.h>
 #include <linux/regulator/machine.h>
 #include <linux/io.h>
@@ -48,6 +47,7 @@
 #include "hsmmc.h"
 #include "pm.h"
 #include "control.h"
+#include "common-board-devices.h"
 
 #define CONFIG_DISABLE_HFCLK 1
 
@@ -59,24 +59,6 @@
 
 #define TWL4030_MSECURE_GPIO 22
 
-/* FIXME: These values need to be updated based on more profiling on 3430sdp*/
-static struct cpuidle_params omap3_cpuidle_params_table[] = {
-	/* C1 */
-	{1, 2, 2, 5},
-	/* C2 */
-	{1, 10, 10, 30},
-	/* C3 */
-	{1, 50, 50, 300},
-	/* C4 */
-	{1, 1500, 1800, 4000},
-	/* C5 */
-	{1, 2500, 7500, 12000},
-	/* C6 */
-	{1, 3000, 8500, 15000},
-	/* C7 */
-	{1, 10000, 30000, 300000},
-};
-
 static uint32_t board_keymap[] = {
 	KEY(0, 0, KEY_LEFT),
 	KEY(0, 1, KEY_RIGHT),
@@ -123,63 +105,14 @@ static struct twl4030_keypad_data sdp3430_kp_data = {
 	.rep		= 1,
 };
 
-static int ts_gpio;	/* Needed for ads7846_get_pendown_state */
-
-/**
- * @brief ads7846_dev_init : Requests & sets GPIO line for pen-irq
- *
- * @return - void. If request gpio fails then Flag KERN_ERR.
- */
-static void ads7846_dev_init(void)
-{
-	if (gpio_request(ts_gpio, "ADS7846 pendown") < 0) {
-		printk(KERN_ERR "can't get ads746 pen down GPIO\n");
-		return;
-	}
-
-	gpio_direction_input(ts_gpio);
-	gpio_set_debounce(ts_gpio, 310);
-}
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(ts_gpio);
-}
-
-static struct ads7846_platform_data tsc2046_config __initdata = {
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-	.wakeup				= true,
-};
-
-
-static struct omap2_mcspi_device_config tsc2046_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static struct spi_board_info sdp3430_spi_board_info[] __initdata = {
-	[0] = {
-		/*
-		 * TSC2046 operates at a max freqency of 2MHz, so
-		 * operate slightly below at 1.5MHz
-		 */
-		.modalias		= "ads7846",
-		.bus_num		= 1,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &tsc2046_mcspi_config,
-		.irq			= 0,
-		.platform_data		= &tsc2046_config,
-	},
-};
-
-
 #define SDP3430_LCD_PANEL_BACKLIGHT_GPIO	8
 #define SDP3430_LCD_PANEL_ENABLE_GPIO		5
 
-static unsigned backlight_gpio;
-static unsigned enable_gpio;
+static struct gpio sdp3430_dss_gpios[] __initdata = {
+	{SDP3430_LCD_PANEL_ENABLE_GPIO,    GPIOF_OUT_INIT_LOW, "LCD reset"    },
+	{SDP3430_LCD_PANEL_BACKLIGHT_GPIO, GPIOF_OUT_INIT_LOW, "LCD Backlight"},
+};
+
 static int lcd_enabled;
 static int dvi_enabled;
 
@@ -187,29 +120,11 @@ static void __init sdp3430_display_init(void)
 {
 	int r;
 
-	enable_gpio    = SDP3430_LCD_PANEL_ENABLE_GPIO;
-	backlight_gpio = SDP3430_LCD_PANEL_BACKLIGHT_GPIO;
-
-	r = gpio_request(enable_gpio, "LCD reset");
-	if (r) {
-		printk(KERN_ERR "failed to get LCD reset GPIO\n");
-		goto err0;
-	}
-
-	r = gpio_request(backlight_gpio, "LCD Backlight");
-	if (r) {
-		printk(KERN_ERR "failed to get LCD backlight GPIO\n");
-		goto err1;
-	}
-
-	gpio_direction_output(enable_gpio, 0);
-	gpio_direction_output(backlight_gpio, 0);
+	r = gpio_request_array(sdp3430_dss_gpios,
+			       ARRAY_SIZE(sdp3430_dss_gpios));
+	if (r)
+		printk(KERN_ERR "failed to get LCD control GPIOs\n");
 
-	return;
-err1:
-	gpio_free(enable_gpio);
-err0:
-	return;
 }
 
 static int sdp3430_panel_enable_lcd(struct omap_dss_device *dssdev)
@@ -219,8 +134,8 @@ static int sdp3430_panel_enable_lcd(struct omap_dss_device *dssdev)
 		return -EINVAL;
 	}
 
-	gpio_direction_output(enable_gpio, 1);
-	gpio_direction_output(backlight_gpio, 1);
+	gpio_direction_output(SDP3430_LCD_PANEL_ENABLE_GPIO, 1);
+	gpio_direction_output(SDP3430_LCD_PANEL_BACKLIGHT_GPIO, 1);
 
 	lcd_enabled = 1;
 
@@ -231,8 +146,8 @@ static void sdp3430_panel_disable_lcd(struct omap_dss_device *dssdev)
 {
 	lcd_enabled = 0;
 
-	gpio_direction_output(enable_gpio, 0);
-	gpio_direction_output(backlight_gpio, 0);
+	gpio_direction_output(SDP3430_LCD_PANEL_ENABLE_GPIO, 0);
+	gpio_direction_output(SDP3430_LCD_PANEL_BACKLIGHT_GPIO, 0);
 }
 
 static int sdp3430_panel_enable_dvi(struct omap_dss_device *dssdev)
@@ -360,12 +275,10 @@ static int sdp3430_twl_gpio_setup(struct device *dev,
 	omap2_hsmmc_init(mmc);
 
 	/* gpio + 7 is "sub_lcd_en_bkl" (output/PWM1) */
-	gpio_request(gpio + 7, "sub_lcd_en_bkl");
-	gpio_direction_output(gpio + 7, 0);
+	gpio_request_one(gpio + 7, GPIOF_OUT_INIT_LOW, "sub_lcd_en_bkl");
 
 	/* gpio + 15 is "sub_lcd_nRST" (output) */
-	gpio_request(gpio + 15, "sub_lcd_nRST");
-	gpio_direction_output(gpio + 15, 0);
+	gpio_request_one(gpio + 15, GPIOF_OUT_INIT_LOW, "sub_lcd_nRST");
 
 	return 0;
 }
@@ -580,20 +493,10 @@ static struct twl4030_platform_data sdp3430_twldata = {
 	.vpll2		= &sdp3430_vpll2,
 };
 
-static struct i2c_board_info __initdata sdp3430_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &sdp3430_twldata,
-	},
-};
-
 static int __init omap3430_i2c_init(void)
 {
 	/* i2c1 for PMIC only */
-	omap_register_i2c_bus(1, 2600, sdp3430_i2c_boardinfo,
-			ARRAY_SIZE(sdp3430_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &sdp3430_twldata);
 	/* i2c2 on camera connector (for sensor control) and optional isp1301 */
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	/* i2c3 on display connector (for DVI, tfp410) */
@@ -872,30 +775,22 @@ static struct flash_partitions sdp_flash_partitions[] = {
 	},
 };
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static void __init omap_3430sdp_init(void)
 {
+	int gpio_pendown;
+
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
 	omap_board_config = sdp3430_config;
 	omap_board_config_size = ARRAY_SIZE(sdp3430_config);
-	omap3_pm_init_cpuidle(omap3_cpuidle_params_table);
 	omap3430_i2c_init();
 	omap_display_init(&sdp3430_dss_data);
 	if (omap_rev() > OMAP3430_REV_ES1_0)
-		ts_gpio = SDP3430_TS_GPIO_IRQ_SDPV2;
+		gpio_pendown = SDP3430_TS_GPIO_IRQ_SDPV2;
 	else
-		ts_gpio = SDP3430_TS_GPIO_IRQ_SDPV1;
-	sdp3430_spi_board_info[0].irq = gpio_to_irq(ts_gpio);
-	spi_register_board_info(sdp3430_spi_board_info,
-				ARRAY_SIZE(sdp3430_spi_board_info));
-	ads7846_dev_init();
+		gpio_pendown = SDP3430_TS_GPIO_IRQ_SDPV1;
+	omap_ads7846_init(1, gpio_pendown, 310, NULL);
 	board_serial_init();
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	board_smc91x_init();
 	board_flash_init(sdp_flash_partitions, chip_sel_3430, 0);
 	sdp3430_display_init();
diff --git a/arch/arm/mach-omap2/board-4430sdp.c b/arch/arm/mach-omap2/board-4430sdp.c
index 93edd7f..73fa90b 100644
--- a/arch/arm/mach-omap2/board-4430sdp.c
+++ b/arch/arm/mach-omap2/board-4430sdp.c
@@ -42,6 +42,7 @@
 #include "hsmmc.h"
 #include "timer-gp.h"
 #include "control.h"
+#include "common-board-devices.h"
 
 #define ETH_KS8851_IRQ			34
 #define ETH_KS8851_POWER_ON		48
@@ -251,58 +252,22 @@ static struct spi_board_info sdp4430_spi_board_info[] __initdata = {
 	},
 };
 
+static struct gpio sdp4430_eth_gpios[] __initdata = {
+	{ ETH_KS8851_POWER_ON,	GPIOF_OUT_INIT_HIGH,	"eth_power"	},
+	{ ETH_KS8851_QUART,	GPIOF_OUT_INIT_HIGH,	"quart"		},
+	{ ETH_KS8851_IRQ,	GPIOF_IN,		"eth_irq"	},
+};
+
 static int omap_ethernet_init(void)
 {
 	int status;
 
 	/* Request of GPIO lines */
+	status = gpio_request_array(sdp4430_eth_gpios,
+				    ARRAY_SIZE(sdp4430_eth_gpios));
+	if (status)
+		pr_err("Cannot request ETH GPIOs\n");
 
-	status = gpio_request(ETH_KS8851_POWER_ON, "eth_power");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", ETH_KS8851_POWER_ON);
-		return status;
-	}
-
-	status = gpio_request(ETH_KS8851_QUART, "quart");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", ETH_KS8851_QUART);
-		goto error1;
-	}
-
-	status = gpio_request(ETH_KS8851_IRQ, "eth_irq");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", ETH_KS8851_IRQ);
-		goto error2;
-	}
-
-	/* Configuration of requested GPIO lines */
-
-	status = gpio_direction_output(ETH_KS8851_POWER_ON, 1);
-	if (status) {
-		pr_err("Cannot set output GPIO %d\n", ETH_KS8851_IRQ);
-		goto error3;
-	}
-
-	status = gpio_direction_output(ETH_KS8851_QUART, 1);
-	if (status) {
-		pr_err("Cannot set output GPIO %d\n", ETH_KS8851_QUART);
-		goto error3;
-	}
-
-	status = gpio_direction_input(ETH_KS8851_IRQ);
-	if (status) {
-		pr_err("Cannot set input GPIO %d\n", ETH_KS8851_IRQ);
-		goto error3;
-	}
-
-	return 0;
-
-error3:
-	gpio_free(ETH_KS8851_IRQ);
-error2:
-	gpio_free(ETH_KS8851_QUART);
-error1:
-	gpio_free(ETH_KS8851_POWER_ON);
 	return status;
 }
 
@@ -575,14 +540,6 @@ static struct twl4030_platform_data sdp4430_twldata = {
 	.usb		= &omap4_usbphy_data
 };
 
-static struct i2c_board_info __initdata sdp4430_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl6030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = OMAP44XX_IRQ_SYS_1N,
-		.platform_data = &sdp4430_twldata,
-	},
-};
 static struct i2c_board_info __initdata sdp4430_i2c_3_boardinfo[] = {
 	{
 		I2C_BOARD_INFO("tmp105", 0x48),
@@ -598,12 +555,7 @@ static struct i2c_board_info __initdata sdp4430_i2c_4_boardinfo[] = {
 };
 static int __init omap4_i2c_init(void)
 {
-	/*
-	 * Phoenix Audio IC needs I2C1 to
-	 * start with 400 KHz or less
-	 */
-	omap_register_i2c_bus(1, 400, sdp4430_i2c_boardinfo,
-			ARRAY_SIZE(sdp4430_i2c_boardinfo));
+	omap4_pmic_init("twl6030", &sdp4430_twldata);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	omap_register_i2c_bus(3, 400, sdp4430_i2c_3_boardinfo,
 				ARRAY_SIZE(sdp4430_i2c_3_boardinfo));
@@ -614,21 +566,13 @@ static int __init omap4_i2c_init(void)
 
 static void __init omap_sfh7741prox_init(void)
 {
-	int  error;
+	int error;
 
-	error = gpio_request(OMAP4_SFH7741_ENABLE_GPIO, "sfh7741");
-	if (error < 0) {
+	error = gpio_request_one(OMAP4_SFH7741_ENABLE_GPIO,
+				 GPIOF_OUT_INIT_LOW, "sfh7741");
+	if (error < 0)
 		pr_err("%s:failed to request GPIO %d, error %d\n",
 			__func__, OMAP4_SFH7741_ENABLE_GPIO, error);
-		return;
-	}
-
-	error = gpio_direction_output(OMAP4_SFH7741_ENABLE_GPIO , 0);
-	if (error < 0) {
-		pr_err("%s: GPIO configuration failed: GPIO %d,error %d\n",
-			 __func__, OMAP4_SFH7741_ENABLE_GPIO, error);
-		gpio_free(OMAP4_SFH7741_ENABLE_GPIO);
-	}
 }
 
 static void sdp4430_hdmi_mux_init(void)
@@ -645,27 +589,19 @@ static void sdp4430_hdmi_mux_init(void)
 			OMAP_PIN_INPUT_PULLUP);
 }
 
+static struct gpio sdp4430_hdmi_gpios[] = {
+	{ HDMI_GPIO_HPD,	GPIOF_OUT_INIT_HIGH,	"hdmi_gpio_hpd"   },
+	{ HDMI_GPIO_LS_OE,	GPIOF_OUT_INIT_HIGH,	"hdmi_gpio_ls_oe" },
+};
+
 static int sdp4430_panel_enable_hdmi(struct omap_dss_device *dssdev)
 {
 	int status;
 
-	status = gpio_request_one(HDMI_GPIO_HPD, GPIOF_OUT_INIT_HIGH,
-							"hdmi_gpio_hpd");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", HDMI_GPIO_HPD);
-		return status;
-	}
-	status = gpio_request_one(HDMI_GPIO_LS_OE, GPIOF_OUT_INIT_HIGH,
-							"hdmi_gpio_ls_oe");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", HDMI_GPIO_LS_OE);
-		goto error1;
-	}
-
-	return 0;
-
-error1:
-	gpio_free(HDMI_GPIO_HPD);
+	status = gpio_request_array(sdp4430_hdmi_gpios,
+				    ARRAY_SIZE(sdp4430_hdmi_gpios));
+	if (status)
+		pr_err("%s: Cannot request HDMI GPIOs\n", __func__);
 
 	return status;
 }
diff --git a/arch/arm/mach-omap2/board-am3517crane.c b/arch/arm/mach-omap2/board-am3517crane.c
index a890d24..5e438a7 100644
--- a/arch/arm/mach-omap2/board-am3517crane.c
+++ b/arch/arm/mach-omap2/board-am3517crane.c
@@ -89,19 +89,13 @@ static void __init am3517_crane_init(void)
 		return;
 	}
 
-	ret = gpio_request(GPIO_USB_POWER, "usb_ehci_enable");
+	ret = gpio_request_one(GPIO_USB_POWER, GPIOF_OUT_INIT_HIGH,
+			       "usb_ehci_enable");
 	if (ret < 0) {
 		pr_err("Can not request GPIO %d\n", GPIO_USB_POWER);
 		return;
 	}
 
-	ret = gpio_direction_output(GPIO_USB_POWER, 1);
-	if (ret < 0) {
-		gpio_free(GPIO_USB_POWER);
-		pr_err("Unable to initialize EHCI power\n");
-		return;
-	}
-
 	usbhs_init(&usbhs_bdata);
 }
 
diff --git a/arch/arm/mach-omap2/board-am3517evm.c b/arch/arm/mach-omap2/board-am3517evm.c
index ff8c59b..63af417 100644
--- a/arch/arm/mach-omap2/board-am3517evm.c
+++ b/arch/arm/mach-omap2/board-am3517evm.c
@@ -174,19 +174,14 @@ static void __init am3517_evm_rtc_init(void)
 	int r;
 
 	omap_mux_init_gpio(GPIO_RTCS35390A_IRQ, OMAP_PIN_INPUT_PULLUP);
-	r = gpio_request(GPIO_RTCS35390A_IRQ, "rtcs35390a-irq");
+
+	r = gpio_request_one(GPIO_RTCS35390A_IRQ, GPIOF_IN, "rtcs35390a-irq");
 	if (r < 0) {
 		printk(KERN_WARNING "failed to request GPIO#%d\n",
 				GPIO_RTCS35390A_IRQ);
 		return;
 	}
-	r = gpio_direction_input(GPIO_RTCS35390A_IRQ);
-	if (r < 0) {
-		printk(KERN_WARNING "GPIO#%d cannot be configured as input\n",
-				GPIO_RTCS35390A_IRQ);
-		gpio_free(GPIO_RTCS35390A_IRQ);
-		return;
-	}
+
 	am3517evm_i2c1_boardinfo[0].irq = gpio_to_irq(GPIO_RTCS35390A_IRQ);
 }
 
@@ -242,6 +237,15 @@ static int dvi_enabled;
 
 #if defined(CONFIG_PANEL_SHARP_LQ043T1DG01) || \
 		defined(CONFIG_PANEL_SHARP_LQ043T1DG01_MODULE)
+static struct gpio am3517_evm_dss_gpios[] __initdata = {
+	/* GPIO 182 = LCD Backlight Power */
+	{ LCD_PANEL_BKLIGHT_PWR, GPIOF_OUT_INIT_HIGH, "lcd_backlight_pwr" },
+	/* GPIO 181 = LCD Panel PWM */
+	{ LCD_PANEL_PWM,	 GPIOF_OUT_INIT_HIGH, "lcd bl enable"	  },
+	/* GPIO 176 = LCD Panel Power enable pin */
+	{ LCD_PANEL_PWR,	 GPIOF_OUT_INIT_HIGH, "dvi enable"	  },
+};
+
 static void __init am3517_evm_display_init(void)
 {
 	int r;
@@ -249,41 +253,15 @@ static void __init am3517_evm_display_init(void)
 	omap_mux_init_gpio(LCD_PANEL_PWR, OMAP_PIN_INPUT_PULLUP);
 	omap_mux_init_gpio(LCD_PANEL_BKLIGHT_PWR, OMAP_PIN_INPUT_PULLDOWN);
 	omap_mux_init_gpio(LCD_PANEL_PWM, OMAP_PIN_INPUT_PULLDOWN);
-	/*
-	 * Enable GPIO 182 = LCD Backlight Power
-	 */
-	r = gpio_request(LCD_PANEL_BKLIGHT_PWR, "lcd_backlight_pwr");
+
+	r = gpio_request_array(am3517_evm_dss_gpios,
+			       ARRAY_SIZE(am3517_evm_dss_gpios));
 	if (r) {
-		printk(KERN_ERR "failed to get lcd_backlight_pwr\n");
+		printk(KERN_ERR "failed to get DSS panel control GPIOs\n");
 		return;
 	}
-	gpio_direction_output(LCD_PANEL_BKLIGHT_PWR, 1);
-	/*
-	 * Enable GPIO 181 = LCD Panel PWM
-	 */
-	r = gpio_request(LCD_PANEL_PWM, "lcd_pwm");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_pwm\n");
-		goto err_1;
-	}
-	gpio_direction_output(LCD_PANEL_PWM, 1);
-	/*
-	 * Enable GPIO 176 = LCD Panel Power enable pin
-	 */
-	r = gpio_request(LCD_PANEL_PWR, "lcd_panel_pwr");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_pwr\n");
-		goto err_2;
-	}
-	gpio_direction_output(LCD_PANEL_PWR, 1);
 
 	printk(KERN_INFO "Display initialized successfully\n");
-	return;
-
-err_2:
-	gpio_free(LCD_PANEL_PWM);
-err_1:
-	gpio_free(LCD_PANEL_BKLIGHT_PWR);
 }
 #else
 static void __init am3517_evm_display_init(void) {}
@@ -396,7 +374,7 @@ static struct omap_musb_board_data musb_board_data = {
 	.power                  = 500,
 	.set_phy_power		= am35x_musb_phy_power,
 	.clear_irq		= am35x_musb_clear_irq,
-	.set_mode		= am35x_musb_set_mode,
+	.set_mode		= am35x_set_mode,
 	.reset			= am35x_musb_reset,
 };
 
diff --git a/arch/arm/mach-omap2/board-apollon.c b/arch/arm/mach-omap2/board-apollon.c
index f4f8374..f3beb8e 100644
--- a/arch/arm/mach-omap2/board-apollon.c
+++ b/arch/arm/mach-omap2/board-apollon.c
@@ -202,6 +202,7 @@ static inline void __init apollon_init_smc91x(void)
 	unsigned int rate;
 	struct clk *gpmc_fck;
 	int eth_cs;
+	int err;
 
 	gpmc_fck = clk_get(NULL, "gpmc_fck");	/* Always on ENABLE_ON_INIT */
 	if (IS_ERR(gpmc_fck)) {
@@ -245,15 +246,13 @@ static inline void __init apollon_init_smc91x(void)
 	apollon_smc91x_resources[0].end   = base + 0x30f;
 	udelay(100);
 
-	omap_mux_init_gpio(74, 0);
-	if (gpio_request(APOLLON_ETHR_GPIO_IRQ, "SMC91x irq") < 0) {
+	omap_mux_init_gpio(APOLLON_ETHR_GPIO_IRQ, 0);
+	err = gpio_request_one(APOLLON_ETHR_GPIO_IRQ, GPIOF_IN, "SMC91x irq");
+	if (err) {
 		printk(KERN_ERR "Failed to request GPIO%d for smc91x IRQ\n",
 			APOLLON_ETHR_GPIO_IRQ);
 		gpmc_cs_free(APOLLON_ETH_CS);
-		goto out;
 	}
-	gpio_direction_input(APOLLON_ETHR_GPIO_IRQ);
-
 out:
 	clk_disable(gpmc_fck);
 	clk_put(gpmc_fck);
@@ -280,20 +279,19 @@ static void __init omap_apollon_init_early(void)
 	omap2_init_common_devices(NULL, NULL);
 }
 
+static struct gpio apollon_gpio_leds[] __initdata = {
+	{ LED0_GPIO13, GPIOF_OUT_INIT_LOW, "LED0" }, /* LED0 - AA10 */
+	{ LED1_GPIO14, GPIOF_OUT_INIT_LOW, "LED1" }, /* LED1 - AA6  */
+	{ LED2_GPIO15, GPIOF_OUT_INIT_LOW, "LED2" }, /* LED2 - AA4  */
+};
+
 static void __init apollon_led_init(void)
 {
-	/* LED0 - AA10 */
 	omap_mux_init_signal("vlynq_clk.gpio_13", 0);
-	gpio_request(LED0_GPIO13, "LED0");
-	gpio_direction_output(LED0_GPIO13, 0);
-	/* LED1  - AA6 */
 	omap_mux_init_signal("vlynq_rx1.gpio_14", 0);
-	gpio_request(LED1_GPIO14, "LED1");
-	gpio_direction_output(LED1_GPIO14, 0);
-	/* LED2  - AA4 */
 	omap_mux_init_signal("vlynq_rx0.gpio_15", 0);
-	gpio_request(LED2_GPIO15, "LED2");
-	gpio_direction_output(LED2_GPIO15, 0);
+
+	gpio_request_array(apollon_gpio_leds, ARRAY_SIZE(apollon_gpio_leds));
 }
 
 static void __init apollon_usb_init(void)
@@ -301,8 +299,7 @@ static void __init apollon_usb_init(void)
 	/* USB device */
 	/* DEVICE_SUSPEND */
 	omap_mux_init_signal("mcbsp2_clkx.gpio_12", 0);
-	gpio_request(12, "USB suspend");
-	gpio_direction_output(12, 0);
+	gpio_request_one(12, GPIOF_OUT_INIT_LOW, "USB suspend");
 	omap2_usbfs_init(&apollon_usb_config);
 }
 
diff --git a/arch/arm/mach-omap2/board-cm-t35.c b/arch/arm/mach-omap2/board-cm-t35.c
index 9340f6a..c63115b 100644
--- a/arch/arm/mach-omap2/board-cm-t35.c
+++ b/arch/arm/mach-omap2/board-cm-t35.c
@@ -54,6 +54,7 @@
 #include "mux.h"
 #include "sdram-micron-mt46h32m32lf-6.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define CM_T35_GPIO_PENDOWN	57
 
@@ -66,86 +67,28 @@
 
 #if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
 #include <linux/smsc911x.h>
+#include <plat/gpmc-smsc911x.h>
 
-static struct smsc911x_platform_config cm_t35_smsc911x_config = {
-	.irq_polarity	= SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type	= SMSC911X_IRQ_TYPE_OPEN_DRAIN,
-	.flags		= SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS,
-	.phy_interface	= PHY_INTERFACE_MODE_MII,
-};
-
-static struct resource cm_t35_smsc911x_resources[] = {
-	{
-		.flags	= IORESOURCE_MEM,
-	},
-	{
-		.start	= OMAP_GPIO_IRQ(CM_T35_SMSC911X_GPIO),
-		.end	= OMAP_GPIO_IRQ(CM_T35_SMSC911X_GPIO),
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
-};
-
-static struct platform_device cm_t35_smsc911x_device = {
-	.name		= "smsc911x",
+static struct omap_smsc911x_platform_data cm_t35_smsc911x_cfg = {
 	.id		= 0,
-	.num_resources	= ARRAY_SIZE(cm_t35_smsc911x_resources),
-	.resource	= cm_t35_smsc911x_resources,
-	.dev		= {
-		.platform_data = &cm_t35_smsc911x_config,
-	},
-};
-
-static struct resource sb_t35_smsc911x_resources[] = {
-	{
-		.flags	= IORESOURCE_MEM,
-	},
-	{
-		.start	= OMAP_GPIO_IRQ(SB_T35_SMSC911X_GPIO),
-		.end	= OMAP_GPIO_IRQ(SB_T35_SMSC911X_GPIO),
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
+	.cs             = CM_T35_SMSC911X_CS,
+	.gpio_irq       = CM_T35_SMSC911X_GPIO,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS,
 };
 
-static struct platform_device sb_t35_smsc911x_device = {
-	.name		= "smsc911x",
+static struct omap_smsc911x_platform_data sb_t35_smsc911x_cfg = {
 	.id		= 1,
-	.num_resources	= ARRAY_SIZE(sb_t35_smsc911x_resources),
-	.resource	= sb_t35_smsc911x_resources,
-	.dev		= {
-		.platform_data = &cm_t35_smsc911x_config,
-	},
+	.cs             = SB_T35_SMSC911X_CS,
+	.gpio_irq       = SB_T35_SMSC911X_GPIO,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS,
 };
 
-static void __init cm_t35_init_smsc911x(struct platform_device *dev,
-					int cs, int irq_gpio)
-{
-	unsigned long cs_mem_base;
-
-	if (gpmc_cs_request(cs, SZ_16M, &cs_mem_base) < 0) {
-		pr_err("CM-T35: Failed request for GPMC mem for smsc911x\n");
-		return;
-	}
-
-	dev->resource[0].start = cs_mem_base + 0x0;
-	dev->resource[0].end   = cs_mem_base + 0xff;
-
-	if ((gpio_request(irq_gpio, "ETH IRQ") == 0) &&
-	    (gpio_direction_input(irq_gpio) == 0)) {
-		gpio_export(irq_gpio, 0);
-	} else {
-		pr_err("CM-T35: could not obtain gpio for SMSC911X IRQ\n");
-		return;
-	}
-
-	platform_device_register(dev);
-}
-
 static void __init cm_t35_init_ethernet(void)
 {
-	cm_t35_init_smsc911x(&cm_t35_smsc911x_device,
-			     CM_T35_SMSC911X_CS, CM_T35_SMSC911X_GPIO);
-	cm_t35_init_smsc911x(&sb_t35_smsc911x_device,
-			     SB_T35_SMSC911X_CS, SB_T35_SMSC911X_GPIO);
+	gpmc_smsc911x_init(&cm_t35_smsc911x_cfg);
+	gpmc_smsc911x_init(&sb_t35_smsc911x_cfg);
 }
 #else
 static inline void __init cm_t35_init_ethernet(void) { return; }
@@ -235,69 +178,10 @@ static void __init cm_t35_init_nand(void)
 static inline void cm_t35_init_nand(void) {}
 #endif
 
-#if defined(CONFIG_TOUCHSCREEN_ADS7846) || \
-	defined(CONFIG_TOUCHSCREEN_ADS7846_MODULE)
-#include <linux/spi/ads7846.h>
-
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(CM_T35_GPIO_PENDOWN);
-}
-
-static struct ads7846_platform_data ads7846_config = {
-	.x_max			= 0x0fff,
-	.y_max			= 0x0fff,
-	.x_plate_ohms		= 180,
-	.pressure_max		= 255,
-	.debounce_max		= 10,
-	.debounce_tol		= 3,
-	.debounce_rep		= 1,
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-};
-
-static struct spi_board_info cm_t35_spi_board_info[] __initdata = {
-	{
-		.modalias		= "ads7846",
-		.bus_num		= 1,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &ads7846_mcspi_config,
-		.irq			= OMAP_GPIO_IRQ(CM_T35_GPIO_PENDOWN),
-		.platform_data		= &ads7846_config,
-	},
-};
-
-static void __init cm_t35_init_ads7846(void)
-{
-	if ((gpio_request(CM_T35_GPIO_PENDOWN, "ADS7846_PENDOWN") == 0) &&
-	    (gpio_direction_input(CM_T35_GPIO_PENDOWN) == 0)) {
-		gpio_export(CM_T35_GPIO_PENDOWN, 0);
-	} else {
-		pr_err("CM-T35: could not obtain gpio for ADS7846_PENDOWN\n");
-		return;
-	}
-
-	spi_register_board_info(cm_t35_spi_board_info,
-				ARRAY_SIZE(cm_t35_spi_board_info));
-}
-#else
-static inline void cm_t35_init_ads7846(void) {}
-#endif
-
 #define CM_T35_LCD_EN_GPIO 157
 #define CM_T35_LCD_BL_GPIO 58
 #define CM_T35_DVI_EN_GPIO 54
 
-static int lcd_bl_gpio;
-static int lcd_en_gpio;
-static int dvi_en_gpio;
-
 static int lcd_enabled;
 static int dvi_enabled;
 
@@ -308,8 +192,8 @@ static int cm_t35_panel_enable_lcd(struct omap_dss_device *dssdev)
 		return -EINVAL;
 	}
 
-	gpio_set_value(lcd_en_gpio, 1);
-	gpio_set_value(lcd_bl_gpio, 1);
+	gpio_set_value(CM_T35_LCD_EN_GPIO, 1);
+	gpio_set_value(CM_T35_LCD_BL_GPIO, 1);
 
 	lcd_enabled = 1;
 
@@ -320,8 +204,8 @@ static void cm_t35_panel_disable_lcd(struct omap_dss_device *dssdev)
 {
 	lcd_enabled = 0;
 
-	gpio_set_value(lcd_bl_gpio, 0);
-	gpio_set_value(lcd_en_gpio, 0);
+	gpio_set_value(CM_T35_LCD_BL_GPIO, 0);
+	gpio_set_value(CM_T35_LCD_EN_GPIO, 0);
 }
 
 static int cm_t35_panel_enable_dvi(struct omap_dss_device *dssdev)
@@ -331,7 +215,7 @@ static int cm_t35_panel_enable_dvi(struct omap_dss_device *dssdev)
 		return -EINVAL;
 	}
 
-	gpio_set_value(dvi_en_gpio, 0);
+	gpio_set_value(CM_T35_DVI_EN_GPIO, 0);
 	dvi_enabled = 1;
 
 	return 0;
@@ -339,7 +223,7 @@ static int cm_t35_panel_enable_dvi(struct omap_dss_device *dssdev)
 
 static void cm_t35_panel_disable_dvi(struct omap_dss_device *dssdev)
 {
-	gpio_set_value(dvi_en_gpio, 1);
+	gpio_set_value(CM_T35_DVI_EN_GPIO, 1);
 	dvi_enabled = 0;
 }
 
@@ -421,62 +305,38 @@ static struct spi_board_info cm_t35_lcd_spi_board_info[] __initdata = {
 	},
 };
 
+static struct gpio cm_t35_dss_gpios[] __initdata = {
+	{ CM_T35_LCD_EN_GPIO, GPIOF_OUT_INIT_LOW,  "lcd enable"    },
+	{ CM_T35_LCD_BL_GPIO, GPIOF_OUT_INIT_LOW,  "lcd bl enable" },
+	{ CM_T35_DVI_EN_GPIO, GPIOF_OUT_INIT_HIGH, "dvi enable"    },
+};
+
 static void __init cm_t35_init_display(void)
 {
 	int err;
 
-	lcd_en_gpio = CM_T35_LCD_EN_GPIO;
-	lcd_bl_gpio = CM_T35_LCD_BL_GPIO;
-	dvi_en_gpio = CM_T35_DVI_EN_GPIO;
-
 	spi_register_board_info(cm_t35_lcd_spi_board_info,
 				ARRAY_SIZE(cm_t35_lcd_spi_board_info));
 
-	err = gpio_request(lcd_en_gpio, "LCD RST");
-	if (err) {
-		pr_err("CM-T35: failed to get LCD reset GPIO\n");
-		goto out;
-	}
-
-	err = gpio_request(lcd_bl_gpio, "LCD BL");
+	err = gpio_request_array(cm_t35_dss_gpios,
+				 ARRAY_SIZE(cm_t35_dss_gpios));
 	if (err) {
-		pr_err("CM-T35: failed to get LCD backlight control GPIO\n");
-		goto err_lcd_bl;
-	}
-
-	err = gpio_request(dvi_en_gpio, "DVI EN");
-	if (err) {
-		pr_err("CM-T35: failed to get DVI reset GPIO\n");
-		goto err_dvi_en;
+		pr_err("CM-T35: failed to request DSS control GPIOs\n");
+		return;
 	}
 
-	gpio_export(lcd_en_gpio, 0);
-	gpio_export(lcd_bl_gpio, 0);
-	gpio_export(dvi_en_gpio, 0);
-	gpio_direction_output(lcd_en_gpio, 0);
-	gpio_direction_output(lcd_bl_gpio, 0);
-	gpio_direction_output(dvi_en_gpio, 1);
+	gpio_export(CM_T35_LCD_EN_GPIO, 0);
+	gpio_export(CM_T35_LCD_BL_GPIO, 0);
+	gpio_export(CM_T35_DVI_EN_GPIO, 0);
 
 	msleep(50);
-	gpio_set_value(lcd_en_gpio, 1);
+	gpio_set_value(CM_T35_LCD_EN_GPIO, 1);
 
 	err = omap_display_init(&cm_t35_dss_data);
 	if (err) {
 		pr_err("CM-T35: failed to register DSS device\n");
-		goto err_dev_reg;
+		gpio_free_array(cm_t35_dss_gpios, ARRAY_SIZE(cm_t35_dss_gpios));
 	}
-
-	return;
-
-err_dev_reg:
-	gpio_free(dvi_en_gpio);
-err_dvi_en:
-	gpio_free(lcd_bl_gpio);
-err_lcd_bl:
-	gpio_free(lcd_en_gpio);
-out:
-
-	return;
 }
 
 static struct regulator_consumer_supply cm_t35_vmmc1_supply = {
@@ -609,10 +469,8 @@ static int cm_t35_twl_gpio_setup(struct device *dev, unsigned gpio,
 {
 	int wlan_rst = gpio + 2;
 
-	if ((gpio_request(wlan_rst, "WLAN RST") == 0) &&
-	    (gpio_direction_output(wlan_rst, 1) == 0)) {
+	if (gpio_request_one(wlan_rst, GPIOF_OUT_INIT_HIGH, "WLAN RST") == 0) {
 		gpio_export(wlan_rst, 0);
-
 		udelay(10);
 		gpio_set_value(wlan_rst, 0);
 		udelay(10);
@@ -653,19 +511,9 @@ static struct twl4030_platform_data cm_t35_twldata = {
 	.vpll2		= &cm_t35_vpll2,
 };
 
-static struct i2c_board_info __initdata cm_t35_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("tps65930", 0x48),
-		.flags		= I2C_CLIENT_WAKE,
-		.irq		= INT_34XX_SYS_NIRQ,
-		.platform_data	= &cm_t35_twldata,
-	},
-};
-
 static void __init cm_t35_init_i2c(void)
 {
-	omap_register_i2c_bus(1, 2600, cm_t35_i2c_boardinfo,
-			      ARRAY_SIZE(cm_t35_i2c_boardinfo));
+	omap3_pmic_init("tps65930", &cm_t35_twldata);
 }
 
 static void __init cm_t35_init_early(void)
@@ -775,12 +623,6 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static struct omap_board_config_kernel cm_t35_config[] __initdata = {
 };
 
@@ -792,12 +634,12 @@ static void __init cm_t35_init(void)
 	omap_serial_init();
 	cm_t35_init_i2c();
 	cm_t35_init_nand();
-	cm_t35_init_ads7846();
+	omap_ads7846_init(1, CM_T35_GPIO_PENDOWN, 0, NULL);
 	cm_t35_init_ethernet();
 	cm_t35_init_led();
 	cm_t35_init_display();
 
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	usbhs_init(&usbhs_bdata);
 }
 
diff --git a/arch/arm/mach-omap2/board-cm-t3517.c b/arch/arm/mach-omap2/board-cm-t3517.c
index a27e3ee..08f08e8 100644
--- a/arch/arm/mach-omap2/board-cm-t3517.c
+++ b/arch/arm/mach-omap2/board-cm-t3517.c
@@ -148,14 +148,13 @@ static void __init cm_t3517_init_rtc(void)
 {
 	int err;
 
-	err = gpio_request(RTC_CS_EN_GPIO, "rtc cs en");
+	err = gpio_request_one(RTC_CS_EN_GPIO, GPIOF_OUT_INIT_HIGH,
+			       "rtc cs en");
 	if (err) {
 		pr_err("CM-T3517: rtc cs en gpio request failed: %d\n", err);
 		return;
 	}
 
-	gpio_direction_output(RTC_CS_EN_GPIO, 1);
-
 	platform_device_register(&cm_t3517_rtc_device);
 }
 #else
@@ -182,11 +181,11 @@ static int cm_t3517_init_usbh(void)
 {
 	int err;
 
-	err = gpio_request(USB_HUB_RESET_GPIO, "usb hub rst");
+	err = gpio_request_one(USB_HUB_RESET_GPIO, GPIOF_OUT_INIT_LOW,
+			       "usb hub rst");
 	if (err) {
 		pr_err("CM-T3517: usb hub rst gpio request failed: %d\n", err);
 	} else {
-		gpio_direction_output(USB_HUB_RESET_GPIO, 0);
 		udelay(10);
 		gpio_set_value(USB_HUB_RESET_GPIO, 1);
 		msleep(1);
diff --git a/arch/arm/mach-omap2/board-devkit8000.c b/arch/arm/mach-omap2/board-devkit8000.c
index 1d1b56a..cf520d7 100644
--- a/arch/arm/mach-omap2/board-devkit8000.c
+++ b/arch/arm/mach-omap2/board-devkit8000.c
@@ -51,7 +51,6 @@
 #include <plat/mcspi.h>
 #include <linux/input/matrix_keypad.h>
 #include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
 #include <linux/dm9000.h>
 #include <linux/interrupt.h>
 
@@ -60,6 +59,7 @@
 #include "mux.h"
 #include "hsmmc.h"
 #include "timer-gp.h"
+#include "common-board-devices.h"
 
 #define NAND_BLOCK_SIZE		SZ_128K
 
@@ -97,13 +97,6 @@ static struct mtd_partition devkit8000_nand_partitions[] = {
 	},
 };
 
-static struct omap_nand_platform_data devkit8000_nand_data = {
-	.options	= NAND_BUSWIDTH_16,
-	.parts		= devkit8000_nand_partitions,
-	.nr_parts	= ARRAY_SIZE(devkit8000_nand_partitions),
-	.dma_channel	= -1,		/* disable DMA in OMAP NAND driver */
-};
-
 static struct omap2_hsmmc_info mmc[] = {
 	{
 		.mmc		= 1,
@@ -249,7 +242,7 @@ static int devkit8000_twl_gpio_setup(struct device *dev,
 	/* TWL4030_GPIO_MAX + 0 is "LCD_PWREN" (out, active high) */
 	devkit8000_lcd_device.reset_gpio = gpio + TWL4030_GPIO_MAX + 0;
 	ret = gpio_request_one(devkit8000_lcd_device.reset_gpio,
-			GPIOF_DIR_OUT | GPIOF_INIT_LOW, "LCD_PWREN");
+			       GPIOF_OUT_INIT_LOW, "LCD_PWREN");
 	if (ret < 0) {
 		devkit8000_lcd_device.reset_gpio = -EINVAL;
 		printk(KERN_ERR "Failed to request GPIO for LCD_PWRN\n");
@@ -258,7 +251,7 @@ static int devkit8000_twl_gpio_setup(struct device *dev,
 	/* gpio + 7 is "DVI_PD" (out, active low) */
 	devkit8000_dvi_device.reset_gpio = gpio + 7;
 	ret = gpio_request_one(devkit8000_dvi_device.reset_gpio,
-			GPIOF_DIR_OUT | GPIOF_INIT_LOW, "DVI PowerDown");
+			       GPIOF_OUT_INIT_LOW, "DVI PowerDown");
 	if (ret < 0) {
 		devkit8000_dvi_device.reset_gpio = -EINVAL;
 		printk(KERN_ERR "Failed to request GPIO for DVI PowerDown\n");
@@ -366,19 +359,9 @@ static struct twl4030_platform_data devkit8000_twldata = {
 	.keypad		= &devkit8000_kp_data,
 };
 
-static struct i2c_board_info __initdata devkit8000_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("tps65930", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &devkit8000_twldata,
-	},
-};
-
 static int __init devkit8000_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2600, devkit8000_i2c_boardinfo,
-			ARRAY_SIZE(devkit8000_i2c_boardinfo));
+	omap3_pmic_init("tps65930", &devkit8000_twldata);
 	/* Bus 3 is attached to the DVI port where devices like the pico DLP
 	 * projector don't work reliably with 400kHz */
 	omap_register_i2c_bus(3, 400, NULL, 0);
@@ -463,56 +446,6 @@ static void __init devkit8000_init_irq(void)
 #endif
 }
 
-static void __init devkit8000_ads7846_init(void)
-{
-	int gpio = OMAP3_DEVKIT_TS_GPIO;
-	int ret;
-
-	ret = gpio_request(gpio, "ads7846_pen_down");
-	if (ret < 0) {
-		printk(KERN_ERR "Failed to request GPIO %d for "
-				"ads7846 pen down IRQ\n", gpio);
-		return;
-	}
-
-	gpio_direction_input(gpio);
-}
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(OMAP3_DEVKIT_TS_GPIO);
-}
-
-static struct ads7846_platform_data ads7846_config = {
-	.x_max                  = 0x0fff,
-	.y_max                  = 0x0fff,
-	.x_plate_ohms           = 180,
-	.pressure_max           = 255,
-	.debounce_max           = 10,
-	.debounce_tol           = 5,
-	.debounce_rep           = 1,
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-	.settle_delay_usecs     = 150,
-};
-
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static struct spi_board_info devkit8000_spi_board_info[] __initdata = {
-	{
-		.modalias		= "ads7846",
-		.bus_num		= 2,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &ads7846_mcspi_config,
-		.irq			= OMAP_GPIO_IRQ(OMAP3_DEVKIT_TS_GPIO),
-		.platform_data		= &ads7846_config,
-	}
-};
-
 #define OMAP_DM9000_BASE	0x2c000000
 
 static struct resource omap_dm9000_resources[] = {
@@ -550,14 +483,14 @@ static void __init omap_dm9000_init(void)
 {
 	unsigned char *eth_addr = omap_dm9000_platdata.dev_addr;
 	struct omap_die_id odi;
+	int ret;
 
-	if (gpio_request(OMAP_DM9000_GPIO_IRQ, "dm9000 irq") < 0) {
+	ret = gpio_request_one(OMAP_DM9000_GPIO_IRQ, GPIOF_IN, "dm9000 irq");
+	if (ret < 0) {
 		printk(KERN_ERR "Failed to request GPIO%d for dm9000 IRQ\n",
 			OMAP_DM9000_GPIO_IRQ);
 		return;
-		}
-
-	gpio_direction_input(OMAP_DM9000_GPIO_IRQ);
+	}
 
 	/* init the mac address using DIE id */
 	omap_get_die_id(&odi);
@@ -576,45 +509,6 @@ static struct platform_device *devkit8000_devices[] __initdata = {
 	&omap_dm9000_dev,
 };
 
-static void __init devkit8000_flash_init(void)
-{
-	u8 cs = 0;
-	u8 nandcs = GPMC_CS_NUM + 1;
-
-	/* find out the chip-select on which NAND exists */
-	while (cs < GPMC_CS_NUM) {
-		u32 ret = 0;
-		ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG1);
-
-		if ((ret & 0xC00) == 0x800) {
-			printk(KERN_INFO "Found NAND on CS%d\n", cs);
-			if (nandcs > GPMC_CS_NUM)
-				nandcs = cs;
-		}
-		cs++;
-	}
-
-	if (nandcs > GPMC_CS_NUM) {
-		printk(KERN_INFO "NAND: Unable to find configuration "
-				 "in GPMC\n ");
-		return;
-	}
-
-	if (nandcs < GPMC_CS_NUM) {
-		devkit8000_nand_data.cs = nandcs;
-
-		printk(KERN_INFO "Registering NAND on CS%d\n", nandcs);
-		if (gpmc_nand_init(&devkit8000_nand_data) < 0)
-			printk(KERN_ERR "Unable to register NAND device\n");
-	}
-}
-
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
 
 	.port_mode[0] = OMAP_EHCI_PORT_MODE_PHY,
@@ -795,14 +689,13 @@ static void __init devkit8000_init(void)
 			ARRAY_SIZE(devkit8000_devices));
 
 	omap_display_init(&devkit8000_dss_data);
-	spi_register_board_info(devkit8000_spi_board_info,
-	ARRAY_SIZE(devkit8000_spi_board_info));
 
-	devkit8000_ads7846_init();
+	omap_ads7846_init(2, OMAP3_DEVKIT_TS_GPIO, 0, NULL);
 
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	usbhs_init(&usbhs_bdata);
-	devkit8000_flash_init();
+	omap_nand_flash_init(NAND_BUSWIDTH_16, devkit8000_nand_partitions,
+			     ARRAY_SIZE(devkit8000_nand_partitions));
 
 	/* Ensure SDRC pins are mux'd for self-refresh */
 	omap_mux_init_signal("sdrc_cke0", OMAP_PIN_OUTPUT);
diff --git a/arch/arm/mach-omap2/board-igep0020.c b/arch/arm/mach-omap2/board-igep0020.c
index 3da64d3..0c1bfca 100644
--- a/arch/arm/mach-omap2/board-igep0020.c
+++ b/arch/arm/mach-omap2/board-igep0020.c
@@ -38,6 +38,7 @@
 #include "mux.h"
 #include "hsmmc.h"
 #include "sdram-numonyx-m65kxxxxam.h"
+#include "common-board-devices.h"
 
 #define IGEP2_SMSC911X_CS       5
 #define IGEP2_SMSC911X_GPIO     176
@@ -54,6 +55,11 @@
 #define IGEP2_RC_GPIO_WIFI_NRESET  139
 #define IGEP2_RC_GPIO_BT_NRESET    137
 
+#define IGEP3_GPIO_LED0_GREEN	54
+#define IGEP3_GPIO_LED0_RED	53
+#define IGEP3_GPIO_LED1_RED	16
+#define IGEP3_GPIO_USBH_NRESET  183
+
 /*
  * IGEP2 Hardware Revision Table
  *
@@ -68,6 +74,7 @@
 
 #define IGEP2_BOARD_HWREV_B	0
 #define IGEP2_BOARD_HWREV_C	1
+#define IGEP3_BOARD_HWREV	2
 
 static u8 hwrev;
 
@@ -75,24 +82,29 @@ static void __init igep2_get_revision(void)
 {
 	u8 ret;
 
+	if (machine_is_igep0030()) {
+		hwrev = IGEP3_BOARD_HWREV;
+		return;
+	}
+
 	omap_mux_init_gpio(IGEP2_GPIO_LED1_RED, OMAP_PIN_INPUT);
 
-	if ((gpio_request(IGEP2_GPIO_LED1_RED, "GPIO_HW0_REV") == 0) &&
-	    (gpio_direction_input(IGEP2_GPIO_LED1_RED) == 0)) {
-		ret = gpio_get_value(IGEP2_GPIO_LED1_RED);
-		if (ret == 0) {
-			pr_info("IGEP2: Hardware Revision C (B-NON compatible)\n");
-			hwrev = IGEP2_BOARD_HWREV_C;
-		} else if (ret ==  1) {
-			pr_info("IGEP2: Hardware Revision B/C (B compatible)\n");
-			hwrev = IGEP2_BOARD_HWREV_B;
-		} else {
-			pr_err("IGEP2: Unknown Hardware Revision\n");
-			hwrev = -1;
-		}
-	} else {
+	if (gpio_request_one(IGEP2_GPIO_LED1_RED, GPIOF_IN, "GPIO_HW0_REV")) {
 		pr_warning("IGEP2: Could not obtain gpio GPIO_HW0_REV\n");
 		pr_err("IGEP2: Unknown Hardware Revision\n");
+		return;
+	}
+
+	ret = gpio_get_value(IGEP2_GPIO_LED1_RED);
+	if (ret == 0) {
+		pr_info("IGEP2: Hardware Revision C (B-NON compatible)\n");
+		hwrev = IGEP2_BOARD_HWREV_C;
+	} else if (ret ==  1) {
+		pr_info("IGEP2: Hardware Revision B/C (B compatible)\n");
+		hwrev = IGEP2_BOARD_HWREV_B;
+	} else {
+		pr_err("IGEP2: Unknown Hardware Revision\n");
+		hwrev = -1;
 	}
 
 	gpio_free(IGEP2_GPIO_LED1_RED);
@@ -111,7 +123,7 @@ static void __init igep2_get_revision(void)
  * So MTD regards it as 4KiB page size and 256KiB block size 64*(2*2048)
  */
 
-static struct mtd_partition igep2_onenand_partitions[] = {
+static struct mtd_partition igep_onenand_partitions[] = {
 	{
 		.name           = "X-Loader",
 		.offset         = 0,
@@ -139,21 +151,21 @@ static struct mtd_partition igep2_onenand_partitions[] = {
 	},
 };
 
-static struct omap_onenand_platform_data igep2_onenand_data = {
-	.parts = igep2_onenand_partitions,
-	.nr_parts = ARRAY_SIZE(igep2_onenand_partitions),
+static struct omap_onenand_platform_data igep_onenand_data = {
+	.parts = igep_onenand_partitions,
+	.nr_parts = ARRAY_SIZE(igep_onenand_partitions),
 	.dma_channel	= -1,	/* disable DMA in OMAP OneNAND driver */
 };
 
-static struct platform_device igep2_onenand_device = {
+static struct platform_device igep_onenand_device = {
 	.name		= "omap2-onenand",
 	.id		= -1,
 	.dev = {
-		.platform_data = &igep2_onenand_data,
+		.platform_data = &igep_onenand_data,
 	},
 };
 
-static void __init igep2_flash_init(void)
+static void __init igep_flash_init(void)
 {
 	u8 cs = 0;
 	u8 onenandcs = GPMC_CS_NUM + 1;
@@ -165,7 +177,7 @@ static void __init igep2_flash_init(void)
 		/* Check if NAND/oneNAND is configured */
 		if ((ret & 0xC00) == 0x800)
 			/* NAND found */
-			pr_err("IGEP2: Unsupported NAND found\n");
+			pr_err("IGEP: Unsupported NAND found\n");
 		else {
 			ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG7);
 			if ((ret & 0x3F) == (ONENAND_MAP >> 24))
@@ -175,85 +187,46 @@ static void __init igep2_flash_init(void)
 	}
 
 	if (onenandcs > GPMC_CS_NUM) {
-		pr_err("IGEP2: Unable to find configuration in GPMC\n");
+		pr_err("IGEP: Unable to find configuration in GPMC\n");
 		return;
 	}
 
-	igep2_onenand_data.cs = onenandcs;
+	igep_onenand_data.cs = onenandcs;
 
-	if (platform_device_register(&igep2_onenand_device) < 0)
-		pr_err("IGEP2: Unable to register OneNAND device\n");
+	if (platform_device_register(&igep_onenand_device) < 0)
+		pr_err("IGEP: Unable to register OneNAND device\n");
 }
 
 #else
-static void __init igep2_flash_init(void) {}
+static void __init igep_flash_init(void) {}
 #endif
 
 #if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
 
 #include <linux/smsc911x.h>
+#include <plat/gpmc-smsc911x.h>
 
-static struct smsc911x_platform_config igep2_smsc911x_config = {
-	.irq_polarity	= SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type	= SMSC911X_IRQ_TYPE_OPEN_DRAIN,
-	.flags		= SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS  ,
-	.phy_interface	= PHY_INTERFACE_MODE_MII,
-};
-
-static struct resource igep2_smsc911x_resources[] = {
-	{
-		.flags	= IORESOURCE_MEM,
-	},
-	{
-		.start	= OMAP_GPIO_IRQ(IGEP2_SMSC911X_GPIO),
-		.end	= OMAP_GPIO_IRQ(IGEP2_SMSC911X_GPIO),
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
-};
-
-static struct platform_device igep2_smsc911x_device = {
-	.name		= "smsc911x",
-	.id		= 0,
-	.num_resources	= ARRAY_SIZE(igep2_smsc911x_resources),
-	.resource	= igep2_smsc911x_resources,
-	.dev		= {
-		.platform_data = &igep2_smsc911x_config,
-	},
+static struct omap_smsc911x_platform_data smsc911x_cfg = {
+	.cs             = IGEP2_SMSC911X_CS,
+	.gpio_irq       = IGEP2_SMSC911X_GPIO,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS,
 };
 
 static inline void __init igep2_init_smsc911x(void)
 {
-	unsigned long cs_mem_base;
-
-	if (gpmc_cs_request(IGEP2_SMSC911X_CS, SZ_16M, &cs_mem_base) < 0) {
-		pr_err("IGEP v2: Failed request for GPMC mem for smsc911x\n");
-		gpmc_cs_free(IGEP2_SMSC911X_CS);
-		return;
-	}
-
-	igep2_smsc911x_resources[0].start = cs_mem_base + 0x0;
-	igep2_smsc911x_resources[0].end   = cs_mem_base + 0xff;
-
-	if ((gpio_request(IGEP2_SMSC911X_GPIO, "SMSC911X IRQ") == 0) &&
-	    (gpio_direction_input(IGEP2_SMSC911X_GPIO) == 0)) {
-		gpio_export(IGEP2_SMSC911X_GPIO, 0);
-	} else {
-		pr_err("IGEP v2: Could not obtain gpio for for SMSC911X IRQ\n");
-		return;
-	}
-
-	platform_device_register(&igep2_smsc911x_device);
+	gpmc_smsc911x_init(&smsc911x_cfg);
 }
 
 #else
 static inline void __init igep2_init_smsc911x(void) { }
 #endif
 
-static struct regulator_consumer_supply igep2_vmmc1_supply =
+static struct regulator_consumer_supply igep_vmmc1_supply =
 	REGULATOR_SUPPLY("vmmc", "omap_hsmmc.0");
 
 /* VMMC1 for OMAP VDD_MMC1 (i/o) and MMC1 card */
-static struct regulator_init_data igep2_vmmc1 = {
+static struct regulator_init_data igep_vmmc1 = {
 	.constraints = {
 		.min_uV			= 1850000,
 		.max_uV			= 3150000,
@@ -264,13 +237,13 @@ static struct regulator_init_data igep2_vmmc1 = {
 					| REGULATOR_CHANGE_STATUS,
 	},
 	.num_consumer_supplies  = 1,
-	.consumer_supplies      = &igep2_vmmc1_supply,
+	.consumer_supplies      = &igep_vmmc1_supply,
 };
 
-static struct regulator_consumer_supply igep2_vio_supply =
+static struct regulator_consumer_supply igep_vio_supply =
 	REGULATOR_SUPPLY("vmmc_aux", "omap_hsmmc.1");
 
-static struct regulator_init_data igep2_vio = {
+static struct regulator_init_data igep_vio = {
 	.constraints = {
 		.min_uV			= 1800000,
 		.max_uV			= 1800000,
@@ -282,34 +255,34 @@ static struct regulator_init_data igep2_vio = {
 					| REGULATOR_CHANGE_STATUS,
 	},
 	.num_consumer_supplies  = 1,
-	.consumer_supplies      = &igep2_vio_supply,
+	.consumer_supplies      = &igep_vio_supply,
 };
 
-static struct regulator_consumer_supply igep2_vmmc2_supply =
+static struct regulator_consumer_supply igep_vmmc2_supply =
 	REGULATOR_SUPPLY("vmmc", "omap_hsmmc.1");
 
-static struct regulator_init_data igep2_vmmc2 = {
+static struct regulator_init_data igep_vmmc2 = {
 	.constraints		= {
 		.valid_modes_mask	= REGULATOR_MODE_NORMAL,
 		.always_on		= 1,
 	},
 	.num_consumer_supplies	= 1,
-	.consumer_supplies	= &igep2_vmmc2_supply,
+	.consumer_supplies	= &igep_vmmc2_supply,
 };
 
-static struct fixed_voltage_config igep2_vwlan = {
+static struct fixed_voltage_config igep_vwlan = {
 	.supply_name		= "vwlan",
 	.microvolts		= 3300000,
 	.gpio			= -EINVAL,
 	.enabled_at_boot	= 1,
-	.init_data		= &igep2_vmmc2,
+	.init_data		= &igep_vmmc2,
 };
 
-static struct platform_device igep2_vwlan_device = {
+static struct platform_device igep_vwlan_device = {
 	.name		= "reg-fixed-voltage",
 	.id		= 0,
 	.dev = {
-		.platform_data	= &igep2_vwlan,
+		.platform_data	= &igep_vwlan,
 	},
 };
 
@@ -334,20 +307,17 @@ static struct omap2_hsmmc_info mmc[] = {
 #if defined(CONFIG_LEDS_GPIO) || defined(CONFIG_LEDS_GPIO_MODULE)
 #include <linux/leds.h>
 
-static struct gpio_led igep2_gpio_leds[] = {
+static struct gpio_led igep_gpio_leds[] = {
 	[0] = {
 		.name			= "gpio-led:red:d0",
-		.gpio			= IGEP2_GPIO_LED0_RED,
 		.default_trigger	= "default-off"
 	},
 	[1] = {
 		.name			= "gpio-led:green:d0",
-		.gpio			= IGEP2_GPIO_LED0_GREEN,
 		.default_trigger	= "default-off",
 	},
 	[2] = {
 		.name			= "gpio-led:red:d1",
-		.gpio			= IGEP2_GPIO_LED1_RED,
 		.default_trigger	= "default-off",
 	},
 	[3] = {
@@ -358,94 +328,119 @@ static struct gpio_led igep2_gpio_leds[] = {
 	},
 };
 
-static struct gpio_led_platform_data igep2_led_pdata = {
-	.leds           = igep2_gpio_leds,
-	.num_leds       = ARRAY_SIZE(igep2_gpio_leds),
+static struct gpio_led_platform_data igep_led_pdata = {
+	.leds           = igep_gpio_leds,
+	.num_leds       = ARRAY_SIZE(igep_gpio_leds),
 };
 
-static struct platform_device igep2_led_device = {
+static struct platform_device igep_led_device = {
 	 .name   = "leds-gpio",
 	 .id     = -1,
 	 .dev    = {
-		 .platform_data  =  &igep2_led_pdata,
+		 .platform_data  =  &igep_led_pdata,
 	},
 };
 
-static void __init igep2_leds_init(void)
+static void __init igep_leds_init(void)
 {
-	platform_device_register(&igep2_led_device);
+	if (machine_is_igep0020()) {
+		igep_gpio_leds[0].gpio = IGEP2_GPIO_LED0_RED;
+		igep_gpio_leds[1].gpio = IGEP2_GPIO_LED0_GREEN;
+		igep_gpio_leds[2].gpio = IGEP2_GPIO_LED1_RED;
+	} else {
+		igep_gpio_leds[0].gpio = IGEP3_GPIO_LED0_RED;
+		igep_gpio_leds[1].gpio = IGEP3_GPIO_LED0_GREEN;
+		igep_gpio_leds[2].gpio = IGEP3_GPIO_LED1_RED;
+	}
+
+	platform_device_register(&igep_led_device);
 }
 
 #else
-static inline void igep2_leds_init(void)
+static struct gpio igep_gpio_leds[] __initdata = {
+	{ -EINVAL,	GPIOF_OUT_INIT_LOW, "gpio-led:red:d0"   },
+	{ -EINVAL,	GPIOF_OUT_INIT_LOW, "gpio-led:green:d0" },
+	{ -EINVAL,	GPIOF_OUT_INIT_LOW, "gpio-led:red:d1"   },
+};
+
+static inline void igep_leds_init(void)
 {
-	if ((gpio_request(IGEP2_GPIO_LED0_RED, "gpio-led:red:d0") == 0) &&
-	    (gpio_direction_output(IGEP2_GPIO_LED0_RED, 0) == 0))
-		gpio_export(IGEP2_GPIO_LED0_RED, 0);
-	else
-		pr_warning("IGEP v2: Could not obtain gpio GPIO_LED0_RED\n");
+	int i;
 
-	if ((gpio_request(IGEP2_GPIO_LED0_GREEN, "gpio-led:green:d0") == 0) &&
-	    (gpio_direction_output(IGEP2_GPIO_LED0_GREEN, 0) == 0))
-		gpio_export(IGEP2_GPIO_LED0_GREEN, 0);
-	else
-		pr_warning("IGEP v2: Could not obtain gpio GPIO_LED0_GREEN\n");
+	if (machine_is_igep0020()) {
+		igep_gpio_leds[0].gpio = IGEP2_GPIO_LED0_RED;
+		igep_gpio_leds[1].gpio = IGEP2_GPIO_LED0_GREEN;
+		igep_gpio_leds[2].gpio = IGEP2_GPIO_LED1_RED;
+	} else {
+		igep_gpio_leds[0].gpio = IGEP3_GPIO_LED0_RED;
+		igep_gpio_leds[1].gpio = IGEP3_GPIO_LED0_GREEN;
+		igep_gpio_leds[2].gpio = IGEP3_GPIO_LED1_RED;
+	}
 
-	if ((gpio_request(IGEP2_GPIO_LED1_RED, "gpio-led:red:d1") == 0) &&
-	    (gpio_direction_output(IGEP2_GPIO_LED1_RED, 0) == 0))
-		gpio_export(IGEP2_GPIO_LED1_RED, 0);
-	else
-		pr_warning("IGEP v2: Could not obtain gpio GPIO_LED1_RED\n");
+	if (gpio_request_array(igep_gpio_leds, ARRAY_SIZE(igep_gpio_leds))) {
+		pr_warning("IGEP v2: Could not obtain leds gpios\n");
+		return;
+	}
 
+	for (i = 0; i < ARRAY_SIZE(igep_gpio_leds); i++)
+		gpio_export(igep_gpio_leds[i].gpio, 0);
 }
 #endif
 
-static int igep2_twl_gpio_setup(struct device *dev,
+static struct gpio igep2_twl_gpios[] = {
+	{ -EINVAL, GPIOF_IN,		"GPIO_EHCI_NOC"  },
+	{ -EINVAL, GPIOF_OUT_INIT_LOW,	"GPIO_USBH_CPEN" },
+};
+
+static int igep_twl_gpio_setup(struct device *dev,
 		unsigned gpio, unsigned ngpio)
 {
+	int ret;
+
 	/* gpio + 0 is "mmc0_cd" (input/IRQ) */
 	mmc[0].gpio_cd = gpio + 0;
 	omap2_hsmmc_init(mmc);
 
-	/*
-	 * REVISIT: need ehci-omap hooks for external VBUS
-	 * power switch and overcurrent detect
-	 */
-	if ((gpio_request(gpio + 1, "GPIO_EHCI_NOC") < 0) ||
-	    (gpio_direction_input(gpio + 1) < 0))
-		pr_err("IGEP2: Could not obtain gpio for EHCI NOC");
-
-	/*
-	 * TWL4030_GPIO_MAX + 0 == ledA, GPIO_USBH_CPEN
-	 * (out, active low)
-	 */
-	if ((gpio_request(gpio + TWL4030_GPIO_MAX, "GPIO_USBH_CPEN") < 0) ||
-	    (gpio_direction_output(gpio + TWL4030_GPIO_MAX, 0) < 0))
-		pr_err("IGEP2: Could not obtain gpio for USBH_CPEN");
-
 	/* TWL4030_GPIO_MAX + 1 == ledB (out, active low LED) */
 #if !defined(CONFIG_LEDS_GPIO) && !defined(CONFIG_LEDS_GPIO_MODULE)
-	if ((gpio_request(gpio+TWL4030_GPIO_MAX+1, "gpio-led:green:d1") == 0)
-	    && (gpio_direction_output(gpio + TWL4030_GPIO_MAX + 1, 1) == 0))
+	ret = gpio_request_one(gpio + TWL4030_GPIO_MAX + 1, GPIOF_OUT_INIT_HIGH,
+			       "gpio-led:green:d1");
+	if (ret == 0)
 		gpio_export(gpio + TWL4030_GPIO_MAX + 1, 0);
 	else
-		pr_warning("IGEP v2: Could not obtain gpio GPIO_LED1_GREEN\n");
+		pr_warning("IGEP: Could not obtain gpio GPIO_LED1_GREEN\n");
 #else
-	igep2_gpio_leds[3].gpio = gpio + TWL4030_GPIO_MAX + 1;
+	igep_gpio_leds[3].gpio = gpio + TWL4030_GPIO_MAX + 1;
 #endif
 
+	if (machine_is_igep0030())
+		return 0;
+
+	/*
+	 * REVISIT: need ehci-omap hooks for external VBUS
+	 * power switch and overcurrent detect
+	 */
+	igep2_twl_gpios[0].gpio = gpio + 1;
+
+	/* TWL4030_GPIO_MAX + 0 == ledA, GPIO_USBH_CPEN (out, active low) */
+	igep2_twl_gpios[1].gpio = gpio + TWL4030_GPIO_MAX;
+
+	ret = gpio_request_array(igep2_twl_gpios, ARRAY_SIZE(igep2_twl_gpios));
+	if (ret < 0)
+		pr_err("IGEP2: Could not obtain gpio for USBH_CPEN");
+
 	return 0;
 };
 
-static struct twl4030_gpio_platform_data igep2_twl4030_gpio_pdata = {
+static struct twl4030_gpio_platform_data igep_twl4030_gpio_pdata = {
 	.gpio_base	= OMAP_MAX_GPIO_LINES,
 	.irq_base	= TWL4030_GPIO_IRQ_BASE,
 	.irq_end	= TWL4030_GPIO_IRQ_END,
 	.use_leds	= true,
-	.setup		= igep2_twl_gpio_setup,
+	.setup		= igep_twl_gpio_setup,
 };
 
-static struct twl4030_usb_data igep2_usb_data = {
+static struct twl4030_usb_data igep_usb_data = {
 	.usb_mode	= T2_USB_MODE_ULPI,
 };
 
@@ -507,16 +502,17 @@ static struct regulator_init_data igep2_vpll2 = {
 
 static void __init igep2_display_init(void)
 {
-	if (gpio_request(IGEP2_GPIO_DVI_PUP, "GPIO_DVI_PUP") &&
-	    gpio_direction_output(IGEP2_GPIO_DVI_PUP, 1))
+	int err = gpio_request_one(IGEP2_GPIO_DVI_PUP, GPIOF_OUT_INIT_HIGH,
+				   "GPIO_DVI_PUP");
+	if (err)
 		pr_err("IGEP v2: Could not obtain gpio GPIO_DVI_PUP\n");
 }
 
-static struct platform_device *igep2_devices[] __initdata = {
-	&igep2_vwlan_device,
+static struct platform_device *igep_devices[] __initdata = {
+	&igep_vwlan_device,
 };
 
-static void __init igep2_init_early(void)
+static void __init igep_init_early(void)
 {
 	omap2_init_common_infrastructure();
 	omap2_init_common_devices(m65kxxxxam_sdrc_params,
@@ -561,27 +557,15 @@ static struct twl4030_keypad_data igep2_keypad_pdata = {
 	.rep		= 1,
 };
 
-static struct twl4030_platform_data igep2_twldata = {
+static struct twl4030_platform_data igep_twldata = {
 	.irq_base	= TWL4030_IRQ_BASE,
 	.irq_end	= TWL4030_IRQ_END,
 
 	/* platform_data for children goes here */
-	.usb		= &igep2_usb_data,
-	.codec		= &igep2_codec_data,
-	.gpio		= &igep2_twl4030_gpio_pdata,
-	.keypad		= &igep2_keypad_pdata,
-	.vmmc1          = &igep2_vmmc1,
-	.vpll2		= &igep2_vpll2,
-	.vio		= &igep2_vio,
-};
-
-static struct i2c_board_info __initdata igep2_i2c1_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags		= I2C_CLIENT_WAKE,
-		.irq		= INT_34XX_SYS_NIRQ,
-		.platform_data	= &igep2_twldata,
-	},
+	.usb		= &igep_usb_data,
+	.gpio		= &igep_twl4030_gpio_pdata,
+	.vmmc1          = &igep_vmmc1,
+	.vio		= &igep_vio,
 };
 
 static struct i2c_board_info __initdata igep2_i2c3_boardinfo[] = {
@@ -590,32 +574,29 @@ static struct i2c_board_info __initdata igep2_i2c3_boardinfo[] = {
 	},
 };
 
-static void __init igep2_i2c_init(void)
+static void __init igep_i2c_init(void)
 {
 	int ret;
 
-	ret = omap_register_i2c_bus(1, 2600, igep2_i2c1_boardinfo,
-		ARRAY_SIZE(igep2_i2c1_boardinfo));
-	if (ret)
-		pr_warning("IGEP2: Could not register I2C1 bus (%d)\n", ret);
+	if (machine_is_igep0020()) {
+		/*
+		 * Bus 3 is attached to the DVI port where devices like the
+		 * pico DLP projector don't work reliably with 400kHz
+		 */
+		ret = omap_register_i2c_bus(3, 100, igep2_i2c3_boardinfo,
+					    ARRAY_SIZE(igep2_i2c3_boardinfo));
+		if (ret)
+			pr_warning("IGEP2: Could not register I2C3 bus (%d)\n", ret);
+
+		igep_twldata.codec	= &igep2_codec_data;
+		igep_twldata.keypad	= &igep2_keypad_pdata;
+		igep_twldata.vpll2	= &igep2_vpll2;
+	}
 
-	/*
-	 * Bus 3 is attached to the DVI port where devices like the pico DLP
-	 * projector don't work reliably with 400kHz
-	 */
-	ret = omap_register_i2c_bus(3, 100, igep2_i2c3_boardinfo,
-		ARRAY_SIZE(igep2_i2c3_boardinfo));
-	if (ret)
-		pr_warning("IGEP2: Could not register I2C3 bus (%d)\n", ret);
+	omap3_pmic_init("twl4030", &igep_twldata);
 }
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
-static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
+static const struct usbhs_omap_board_data igep2_usbhs_bdata __initconst = {
 	.port_mode[0] = OMAP_EHCI_PORT_MODE_PHY,
 	.port_mode[1] = OMAP_USBHS_PORT_MODE_UNUSED,
 	.port_mode[2] = OMAP_USBHS_PORT_MODE_UNUSED,
@@ -626,6 +607,17 @@ static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
 	.reset_gpio_port[2] = -EINVAL,
 };
 
+static const struct usbhs_omap_board_data igep3_usbhs_bdata __initconst = {
+	.port_mode[0] = OMAP_USBHS_PORT_MODE_UNUSED,
+	.port_mode[1] = OMAP_EHCI_PORT_MODE_PHY,
+	.port_mode[2] = OMAP_USBHS_PORT_MODE_UNUSED,
+
+	.phy_reset = true,
+	.reset_gpio_port[0] = -EINVAL,
+	.reset_gpio_port[1] = IGEP3_GPIO_USBH_NRESET,
+	.reset_gpio_port[2] = -EINVAL,
+};
+
 #ifdef CONFIG_OMAP_MUX
 static struct omap_board_mux board_mux[] __initdata = {
 	{ .reg_offset = OMAP_MUX_TERMINATOR },
@@ -633,82 +625,95 @@ static struct omap_board_mux board_mux[] __initdata = {
 #endif
 
 #if defined(CONFIG_LIBERTAS_SDIO) || defined(CONFIG_LIBERTAS_SDIO_MODULE)
+static struct gpio igep_wlan_bt_gpios[] __initdata = {
+	{ -EINVAL, GPIOF_OUT_INIT_HIGH, "GPIO_WIFI_NPD"	   },
+	{ -EINVAL, GPIOF_OUT_INIT_HIGH, "GPIO_WIFI_NRESET" },
+	{ -EINVAL, GPIOF_OUT_INIT_HIGH, "GPIO_BT_NRESET"   },
+};
 
-static void __init igep2_wlan_bt_init(void)
+static void __init igep_wlan_bt_init(void)
 {
-	unsigned npd, wreset, btreset;
+	int err;
 
 	/* GPIO's for WLAN-BT combo depends on hardware revision */
 	if (hwrev == IGEP2_BOARD_HWREV_B) {
-		npd = IGEP2_RB_GPIO_WIFI_NPD;
-		wreset = IGEP2_RB_GPIO_WIFI_NRESET;
-		btreset = IGEP2_RB_GPIO_BT_NRESET;
-	} else if (hwrev == IGEP2_BOARD_HWREV_C) {
-		npd = IGEP2_RC_GPIO_WIFI_NPD;
-		wreset = IGEP2_RC_GPIO_WIFI_NRESET;
-		btreset = IGEP2_RC_GPIO_BT_NRESET;
+		igep_wlan_bt_gpios[0].gpio = IGEP2_RB_GPIO_WIFI_NPD;
+		igep_wlan_bt_gpios[1].gpio = IGEP2_RB_GPIO_WIFI_NRESET;
+		igep_wlan_bt_gpios[2].gpio = IGEP2_RB_GPIO_BT_NRESET;
+	} else if (hwrev == IGEP2_BOARD_HWREV_C || machine_is_igep0030()) {
+		igep_wlan_bt_gpios[0].gpio = IGEP2_RC_GPIO_WIFI_NPD;
+		igep_wlan_bt_gpios[1].gpio = IGEP2_RC_GPIO_WIFI_NRESET;
+		igep_wlan_bt_gpios[2].gpio = IGEP2_RC_GPIO_BT_NRESET;
 	} else
 		return;
 
-	/* Set GPIO's for  WLAN-BT combo module */
-	if ((gpio_request(npd, "GPIO_WIFI_NPD") == 0) &&
-	    (gpio_direction_output(npd, 1) == 0)) {
-		gpio_export(npd, 0);
-	} else
-		pr_warning("IGEP2: Could not obtain gpio GPIO_WIFI_NPD\n");
-
-	if ((gpio_request(wreset, "GPIO_WIFI_NRESET") == 0) &&
-	    (gpio_direction_output(wreset, 1) == 0)) {
-		gpio_export(wreset, 0);
-		gpio_set_value(wreset, 0);
-		udelay(10);
-		gpio_set_value(wreset, 1);
-	} else
-		pr_warning("IGEP2: Could not obtain gpio GPIO_WIFI_NRESET\n");
+	err = gpio_request_array(igep_wlan_bt_gpios,
+				 ARRAY_SIZE(igep_wlan_bt_gpios));
+	if (err) {
+		pr_warning("IGEP2: Could not obtain WIFI/BT gpios\n");
+		return;
+	}
+
+	gpio_export(igep_wlan_bt_gpios[0].gpio, 0);
+	gpio_export(igep_wlan_bt_gpios[1].gpio, 0);
+	gpio_export(igep_wlan_bt_gpios[2].gpio, 0);
+
+	gpio_set_value(igep_wlan_bt_gpios[1].gpio, 0);
+	udelay(10);
+	gpio_set_value(igep_wlan_bt_gpios[1].gpio, 1);
 
-	if ((gpio_request(btreset, "GPIO_BT_NRESET") == 0) &&
-	    (gpio_direction_output(btreset, 1) == 0)) {
-		gpio_export(btreset, 0);
-	} else
-		pr_warning("IGEP2: Could not obtain gpio GPIO_BT_NRESET\n");
 }
 #else
-static inline void __init igep2_wlan_bt_init(void) { }
+static inline void __init igep_wlan_bt_init(void) { }
 #endif
 
-static void __init igep2_init(void)
+static void __init igep_init(void)
 {
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
 
 	/* Get IGEP2 hardware revision */
 	igep2_get_revision();
 	/* Register I2C busses and drivers */
-	igep2_i2c_init();
-	platform_add_devices(igep2_devices, ARRAY_SIZE(igep2_devices));
-	omap_display_init(&igep2_dss_data);
+	igep_i2c_init();
+	platform_add_devices(igep_devices, ARRAY_SIZE(igep_devices));
 	omap_serial_init();
-	usb_musb_init(&musb_board_data);
-	usbhs_init(&usbhs_bdata);
+	usb_musb_init(NULL);
 
-	igep2_flash_init();
-	igep2_leds_init();
-	igep2_display_init();
-	igep2_init_smsc911x();
+	igep_flash_init();
+	igep_leds_init();
 
 	/*
 	 * WLAN-BT combo module from MuRata which has a Marvell WLAN
 	 * (88W8686) + CSR Bluetooth chipset. Uses SDIO interface.
 	 */
-	igep2_wlan_bt_init();
+	igep_wlan_bt_init();
 
+	if (machine_is_igep0020()) {
+		omap_display_init(&igep2_dss_data);
+		igep2_display_init();
+		igep2_init_smsc911x();
+		usbhs_init(&igep2_usbhs_bdata);
+	} else {
+		usbhs_init(&igep3_usbhs_bdata);
+	}
 }
 
 MACHINE_START(IGEP0020, "IGEP v2 board")
 	.boot_params	= 0x80000100,
 	.reserve	= omap_reserve,
 	.map_io		= omap3_map_io,
-	.init_early	= igep2_init_early,
+	.init_early	= igep_init_early,
+	.init_irq	= omap_init_irq,
+	.init_machine	= igep_init,
+	.timer		= &omap_timer,
+MACHINE_END
+
+MACHINE_START(IGEP0030, "IGEP OMAP3 module")
+	.boot_params	= 0x80000100,
+	.reserve	= omap_reserve,
+	.map_io		= omap3_map_io,
+	.init_early	= igep_init_early,
 	.init_irq	= omap_init_irq,
-	.init_machine	= igep2_init,
+	.init_machine	= igep_init,
 	.timer		= &omap_timer,
 MACHINE_END
diff --git a/arch/arm/mach-omap2/board-igep0030.c b/arch/arm/mach-omap2/board-igep0030.c
deleted file mode 100644
index 2cf86c3..0000000
--- a/arch/arm/mach-omap2/board-igep0030.c
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- * Copyright (C) 2010 - ISEE 2007 SL
- *
- * Modified from mach-omap2/board-generic.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/gpio.h>
-#include <linux/interrupt.h>
-
-#include <linux/regulator/machine.h>
-#include <linux/regulator/fixed.h>
-#include <linux/i2c/twl.h>
-#include <linux/mmc/host.h>
-
-#include <asm/mach-types.h>
-#include <asm/mach/arch.h>
-
-#include <plat/board.h>
-#include <plat/common.h>
-#include <plat/gpmc.h>
-#include <plat/usb.h>
-#include <plat/onenand.h>
-
-#include "mux.h"
-#include "hsmmc.h"
-#include "sdram-numonyx-m65kxxxxam.h"
-
-#define IGEP3_GPIO_LED0_GREEN	54
-#define IGEP3_GPIO_LED0_RED	53
-#define IGEP3_GPIO_LED1_RED	16
-
-#define IGEP3_GPIO_WIFI_NPD	138
-#define IGEP3_GPIO_WIFI_NRESET	139
-#define IGEP3_GPIO_BT_NRESET	137
-
-#define IGEP3_GPIO_USBH_NRESET  183
-
-
-#if defined(CONFIG_MTD_ONENAND_OMAP2) || \
-	defined(CONFIG_MTD_ONENAND_OMAP2_MODULE)
-
-#define ONENAND_MAP             0x20000000
-
-/*
- * x2 Flash built-in COMBO POP MEMORY
- * Since the device is equipped with two DataRAMs, and two-plane NAND
- * Flash memory array, these two component enables simultaneous program
- * of 4KiB. Plane1 has only even blocks such as block0, block2, block4
- * while Plane2 has only odd blocks such as block1, block3, block5.
- * So MTD regards it as 4KiB page size and 256KiB block size 64*(2*2048)
- */
-
-static struct mtd_partition igep3_onenand_partitions[] = {
-	{
-		.name           = "X-Loader",
-		.offset         = 0,
-		.size           = 2 * (64*(2*2048))
-	},
-	{
-		.name           = "U-Boot",
-		.offset         = MTDPART_OFS_APPEND,
-		.size           = 6 * (64*(2*2048)),
-	},
-	{
-		.name           = "Environment",
-		.offset         = MTDPART_OFS_APPEND,
-		.size           = 2 * (64*(2*2048)),
-	},
-	{
-		.name           = "Kernel",
-		.offset         = MTDPART_OFS_APPEND,
-		.size           = 12 * (64*(2*2048)),
-	},
-	{
-		.name           = "File System",
-		.offset         = MTDPART_OFS_APPEND,
-		.size           = MTDPART_SIZ_FULL,
-	},
-};
-
-static struct omap_onenand_platform_data igep3_onenand_pdata = {
-	.parts = igep3_onenand_partitions,
-	.nr_parts = ARRAY_SIZE(igep3_onenand_partitions),
-	.onenand_setup = NULL,
-	.dma_channel	= -1,	/* disable DMA in OMAP OneNAND driver */
-};
-
-static struct platform_device igep3_onenand_device = {
-	.name		= "omap2-onenand",
-	.id		= -1,
-	.dev = {
-		.platform_data = &igep3_onenand_pdata,
-	},
-};
-
-static void __init igep3_flash_init(void)
-{
-	u8 cs = 0;
-	u8 onenandcs = GPMC_CS_NUM + 1;
-
-	for (cs = 0; cs < GPMC_CS_NUM; cs++) {
-		u32 ret;
-		ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG1);
-
-		/* Check if NAND/oneNAND is configured */
-		if ((ret & 0xC00) == 0x800)
-			/* NAND found */
-			pr_err("IGEP3: Unsupported NAND found\n");
-		else {
-			ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG7);
-
-			if ((ret & 0x3F) == (ONENAND_MAP >> 24))
-				/* OneNAND found */
-				onenandcs = cs;
-		}
-	}
-
-	if (onenandcs > GPMC_CS_NUM) {
-		pr_err("IGEP3: Unable to find configuration in GPMC\n");
-		return;
-	}
-
-	igep3_onenand_pdata.cs = onenandcs;
-
-	if (platform_device_register(&igep3_onenand_device) < 0)
-		pr_err("IGEP3: Unable to register OneNAND device\n");
-}
-
-#else
-static void __init igep3_flash_init(void) {}
-#endif
-
-static struct regulator_consumer_supply igep3_vmmc1_supply =
-	REGULATOR_SUPPLY("vmmc", "omap_hsmmc.0");
-
-/* VMMC1 for OMAP VDD_MMC1 (i/o) and MMC1 card */
-static struct regulator_init_data igep3_vmmc1 = {
-	.constraints = {
-		.min_uV			= 1850000,
-		.max_uV			= 3150000,
-		.valid_modes_mask	= REGULATOR_MODE_NORMAL
-					| REGULATOR_MODE_STANDBY,
-		.valid_ops_mask		= REGULATOR_CHANGE_VOLTAGE
-					| REGULATOR_CHANGE_MODE
-					| REGULATOR_CHANGE_STATUS,
-	},
-	.num_consumer_supplies  = 1,
-	.consumer_supplies      = &igep3_vmmc1_supply,
-};
-
-static struct regulator_consumer_supply igep3_vio_supply =
-	REGULATOR_SUPPLY("vmmc_aux", "omap_hsmmc.1");
-
-static struct regulator_init_data igep3_vio = {
-	.constraints = {
-		.min_uV			= 1800000,
-		.max_uV			= 1800000,
-		.apply_uV		= 1,
-		.valid_modes_mask	= REGULATOR_MODE_NORMAL
-					| REGULATOR_MODE_STANDBY,
-		.valid_ops_mask		= REGULATOR_CHANGE_VOLTAGE
-					| REGULATOR_CHANGE_MODE
-					| REGULATOR_CHANGE_STATUS,
-	},
-	.num_consumer_supplies	= 1,
-	.consumer_supplies	= &igep3_vio_supply,
-};
-
-static struct regulator_consumer_supply igep3_vmmc2_supply =
-	REGULATOR_SUPPLY("vmmc", "omap_hsmmc.1");
-
-static struct regulator_init_data igep3_vmmc2 = {
-	.constraints	= {
-		.valid_modes_mask	= REGULATOR_MODE_NORMAL,
-		.always_on		= 1,
-	},
-	.num_consumer_supplies	= 1,
-	.consumer_supplies	= &igep3_vmmc2_supply,
-};
-
-static struct fixed_voltage_config igep3_vwlan = {
-	.supply_name		= "vwlan",
-	.microvolts		= 3300000,
-	.gpio			= -EINVAL,
-	.enabled_at_boot	= 1,
-	.init_data		= &igep3_vmmc2,
-};
-
-static struct platform_device igep3_vwlan_device = {
-	.name	= "reg-fixed-voltage",
-	.id	= 0,
-	.dev	= {
-		.platform_data = &igep3_vwlan,
-	},
-};
-
-static struct omap2_hsmmc_info mmc[] = {
-	[0] = {
-		.mmc		= 1,
-		.caps		= MMC_CAP_4_BIT_DATA,
-		.gpio_cd	= -EINVAL,
-		.gpio_wp	= -EINVAL,
-	},
-#if defined(CONFIG_LIBERTAS_SDIO) || defined(CONFIG_LIBERTAS_SDIO_MODULE)
-	[1] = {
-		.mmc		= 2,
-		.caps		= MMC_CAP_4_BIT_DATA,
-		.gpio_cd	= -EINVAL,
-		.gpio_wp	= -EINVAL,
-	},
-#endif
-	{}      /* Terminator */
-};
-
-#if defined(CONFIG_LEDS_GPIO) || defined(CONFIG_LEDS_GPIO_MODULE)
-#include <linux/leds.h>
-
-static struct gpio_led igep3_gpio_leds[] = {
-	[0] = {
-		.name			= "gpio-led:red:d0",
-		.gpio			= IGEP3_GPIO_LED0_RED,
-		.default_trigger	= "default-off"
-	},
-	[1] = {
-		.name			= "gpio-led:green:d0",
-		.gpio			= IGEP3_GPIO_LED0_GREEN,
-		.default_trigger	= "default-off",
-	},
-	[2] = {
-		.name			= "gpio-led:red:d1",
-		.gpio			= IGEP3_GPIO_LED1_RED,
-		.default_trigger	= "default-off",
-	},
-	[3] = {
-		.name			= "gpio-led:green:d1",
-		.default_trigger	= "heartbeat",
-		.gpio			= -EINVAL, /* gets replaced */
-	},
-};
-
-static struct gpio_led_platform_data igep3_led_pdata = {
-	.leds           = igep3_gpio_leds,
-	.num_leds       = ARRAY_SIZE(igep3_gpio_leds),
-};
-
-static struct platform_device igep3_led_device = {
-	 .name   = "leds-gpio",
-	 .id     = -1,
-	 .dev    = {
-		 .platform_data = &igep3_led_pdata,
-	},
-};
-
-static void __init igep3_leds_init(void)
-{
-	platform_device_register(&igep3_led_device);
-}
-
-#else
-static inline void igep3_leds_init(void)
-{
-	if ((gpio_request(IGEP3_GPIO_LED0_RED, "gpio-led:red:d0") == 0) &&
-	    (gpio_direction_output(IGEP3_GPIO_LED0_RED, 1) == 0)) {
-		gpio_export(IGEP3_GPIO_LED0_RED, 0);
-		gpio_set_value(IGEP3_GPIO_LED0_RED, 1);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_LED0_RED\n");
-
-	if ((gpio_request(IGEP3_GPIO_LED0_GREEN, "gpio-led:green:d0") == 0) &&
-	    (gpio_direction_output(IGEP3_GPIO_LED0_GREEN, 1) == 0)) {
-		gpio_export(IGEP3_GPIO_LED0_GREEN, 0);
-		gpio_set_value(IGEP3_GPIO_LED0_GREEN, 1);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_LED0_GREEN\n");
-
-	if ((gpio_request(IGEP3_GPIO_LED1_RED, "gpio-led:red:d1") == 0) &&
-		(gpio_direction_output(IGEP3_GPIO_LED1_RED, 1) == 0)) {
-		gpio_export(IGEP3_GPIO_LED1_RED, 0);
-		gpio_set_value(IGEP3_GPIO_LED1_RED, 1);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_LED1_RED\n");
-}
-#endif
-
-static int igep3_twl4030_gpio_setup(struct device *dev,
-		unsigned gpio, unsigned ngpio)
-{
-	/* gpio + 0 is "mmc0_cd" (input/IRQ) */
-	mmc[0].gpio_cd = gpio + 0;
-	omap2_hsmmc_init(mmc);
-
-	/* TWL4030_GPIO_MAX + 1 == ledB (out, active low LED) */
-#if !defined(CONFIG_LEDS_GPIO) && !defined(CONFIG_LEDS_GPIO_MODULE)
-	if ((gpio_request(gpio+TWL4030_GPIO_MAX+1, "gpio-led:green:d1") == 0)
-	    && (gpio_direction_output(gpio + TWL4030_GPIO_MAX + 1, 1) == 0)) {
-		gpio_export(gpio + TWL4030_GPIO_MAX + 1, 0);
-		gpio_set_value(gpio + TWL4030_GPIO_MAX + 1, 0);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_LED1_GREEN\n");
-#else
-	igep3_gpio_leds[3].gpio = gpio + TWL4030_GPIO_MAX + 1;
-#endif
-
-	return 0;
-};
-
-static struct twl4030_gpio_platform_data igep3_twl4030_gpio_pdata = {
-	.gpio_base	= OMAP_MAX_GPIO_LINES,
-	.irq_base	= TWL4030_GPIO_IRQ_BASE,
-	.irq_end	= TWL4030_GPIO_IRQ_END,
-	.use_leds	= true,
-	.setup		= igep3_twl4030_gpio_setup,
-};
-
-static struct twl4030_usb_data igep3_twl4030_usb_data = {
-	.usb_mode	= T2_USB_MODE_ULPI,
-};
-
-static struct platform_device *igep3_devices[] __initdata = {
-	&igep3_vwlan_device,
-};
-
-static void __init igep3_init_early(void)
-{
-	omap2_init_common_infrastructure();
-	omap2_init_common_devices(m65kxxxxam_sdrc_params,
-				  m65kxxxxam_sdrc_params);
-}
-
-static struct twl4030_platform_data igep3_twl4030_pdata = {
-	.irq_base	= TWL4030_IRQ_BASE,
-	.irq_end	= TWL4030_IRQ_END,
-
-	/* platform_data for children goes here */
-	.usb		= &igep3_twl4030_usb_data,
-	.gpio		= &igep3_twl4030_gpio_pdata,
-	.vmmc1		= &igep3_vmmc1,
-	.vio		= &igep3_vio,
-};
-
-static struct i2c_board_info __initdata igep3_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags		= I2C_CLIENT_WAKE,
-		.irq		= INT_34XX_SYS_NIRQ,
-		.platform_data	= &igep3_twl4030_pdata,
-	},
-};
-
-static int __init igep3_i2c_init(void)
-{
-	omap_register_i2c_bus(1, 2600, igep3_i2c_boardinfo,
-			ARRAY_SIZE(igep3_i2c_boardinfo));
-
-	return 0;
-}
-
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type	= MUSB_INTERFACE_ULPI,
-	.mode		= MUSB_OTG,
-	.power		= 100,
-};
-
-#if defined(CONFIG_LIBERTAS_SDIO) || defined(CONFIG_LIBERTAS_SDIO_MODULE)
-
-static void __init igep3_wifi_bt_init(void)
-{
-	/* Configure MUX values for W-LAN + Bluetooth GPIO's */
-	omap_mux_init_gpio(IGEP3_GPIO_WIFI_NPD, OMAP_PIN_OUTPUT);
-	omap_mux_init_gpio(IGEP3_GPIO_WIFI_NRESET, OMAP_PIN_OUTPUT);
-	omap_mux_init_gpio(IGEP3_GPIO_BT_NRESET, OMAP_PIN_OUTPUT);
-
-	/* Set GPIO's for  W-LAN + Bluetooth combo module */
-	if ((gpio_request(IGEP3_GPIO_WIFI_NPD, "GPIO_WIFI_NPD") == 0) &&
-	    (gpio_direction_output(IGEP3_GPIO_WIFI_NPD, 1) == 0)) {
-		gpio_export(IGEP3_GPIO_WIFI_NPD, 0);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_WIFI_NPD\n");
-
-	if ((gpio_request(IGEP3_GPIO_WIFI_NRESET, "GPIO_WIFI_NRESET") == 0) &&
-	    (gpio_direction_output(IGEP3_GPIO_WIFI_NRESET, 1) == 0)) {
-		gpio_export(IGEP3_GPIO_WIFI_NRESET, 0);
-		gpio_set_value(IGEP3_GPIO_WIFI_NRESET, 0);
-		udelay(10);
-		gpio_set_value(IGEP3_GPIO_WIFI_NRESET, 1);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_WIFI_NRESET\n");
-
-	if ((gpio_request(IGEP3_GPIO_BT_NRESET, "GPIO_BT_NRESET") == 0) &&
-	    (gpio_direction_output(IGEP3_GPIO_BT_NRESET, 1) == 0)) {
-		gpio_export(IGEP3_GPIO_BT_NRESET, 0);
-	} else
-		pr_warning("IGEP3: Could not obtain gpio GPIO_BT_NRESET\n");
-}
-#else
-void __init igep3_wifi_bt_init(void) {}
-#endif
-
-static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
-	.port_mode[0] = OMAP_USBHS_PORT_MODE_UNUSED,
-	.port_mode[1] = OMAP_EHCI_PORT_MODE_PHY,
-	.port_mode[2] = OMAP_USBHS_PORT_MODE_UNUSED,
-
-	.phy_reset = true,
-	.reset_gpio_port[0] = -EINVAL,
-	.reset_gpio_port[1] = IGEP3_GPIO_USBH_NRESET,
-	.reset_gpio_port[2] = -EINVAL,
-};
-
-#ifdef CONFIG_OMAP_MUX
-static struct omap_board_mux board_mux[] __initdata = {
-	OMAP3_MUX(I2C2_SDA, OMAP_MUX_MODE4 | OMAP_PIN_OUTPUT),
-	{ .reg_offset = OMAP_MUX_TERMINATOR },
-};
-#endif
-
-static void __init igep3_init(void)
-{
-	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
-
-	/* Register I2C busses and drivers */
-	igep3_i2c_init();
-	platform_add_devices(igep3_devices, ARRAY_SIZE(igep3_devices));
-	omap_serial_init();
-	usb_musb_init(&musb_board_data);
-	usbhs_init(&usbhs_bdata);
-
-	igep3_flash_init();
-	igep3_leds_init();
-
-	/*
-	 * WLAN-BT combo module from MuRata which has a Marvell WLAN
-	 * (88W8686) + CSR Bluetooth chipset. Uses SDIO interface.
-	 */
-	igep3_wifi_bt_init();
-
-}
-
-MACHINE_START(IGEP0030, "IGEP OMAP3 module")
-	.boot_params	= 0x80000100,
-	.reserve	= omap_reserve,
-	.map_io		= omap3_map_io,
-	.init_early	= igep3_init_early,
-	.init_irq	= omap_init_irq,
-	.init_machine	= igep3_init,
-	.timer		= &omap_timer,
-MACHINE_END
diff --git a/arch/arm/mach-omap2/board-ldp.c b/arch/arm/mach-omap2/board-ldp.c
index e2ba779..f7d6038 100644
--- a/arch/arm/mach-omap2/board-ldp.c
+++ b/arch/arm/mach-omap2/board-ldp.c
@@ -22,7 +22,6 @@
 #include <linux/err.h>
 #include <linux/clk.h>
 #include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
 #include <linux/regulator/machine.h>
 #include <linux/i2c/twl.h>
 #include <linux/io.h>
@@ -43,47 +42,19 @@
 
 #include <asm/delay.h>
 #include <plat/usb.h>
+#include <plat/gpmc-smsc911x.h>
 
 #include "board-flash.h"
 #include "mux.h"
 #include "hsmmc.h"
 #include "control.h"
+#include "common-board-devices.h"
 
 #define LDP_SMSC911X_CS		1
 #define LDP_SMSC911X_GPIO	152
 #define DEBUG_BASE		0x08000000
 #define LDP_ETHR_START		DEBUG_BASE
 
-static struct resource ldp_smsc911x_resources[] = {
-	[0] = {
-		.start	= LDP_ETHR_START,
-		.end	= LDP_ETHR_START + SZ_4K,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= 0,
-		.end	= 0,
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
-};
-
-static struct smsc911x_platform_config ldp_smsc911x_config = {
-	.irq_polarity	= SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type	= SMSC911X_IRQ_TYPE_OPEN_DRAIN,
-	.flags		= SMSC911X_USE_32BIT,
-	.phy_interface	= PHY_INTERFACE_MODE_MII,
-};
-
-static struct platform_device ldp_smsc911x_device = {
-	.name		= "smsc911x",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(ldp_smsc911x_resources),
-	.resource	= ldp_smsc911x_resources,
-	.dev		= {
-		.platform_data = &ldp_smsc911x_config,
-	},
-};
-
 static uint32_t board_keymap[] = {
 	KEY(0, 0, KEY_1),
 	KEY(1, 0, KEY_2),
@@ -197,82 +168,16 @@ static struct platform_device ldp_gpio_keys_device = {
 	},
 };
 
-static int ts_gpio;
-
-/**
- * @brief ads7846_dev_init : Requests & sets GPIO line for pen-irq
- *
- * @return - void. If request gpio fails then Flag KERN_ERR.
- */
-static void ads7846_dev_init(void)
-{
-	if (gpio_request(ts_gpio, "ads7846 irq") < 0) {
-		printk(KERN_ERR "can't get ads746 pen down GPIO\n");
-		return;
-	}
-
-	gpio_direction_input(ts_gpio);
-	gpio_set_debounce(ts_gpio, 310);
-}
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(ts_gpio);
-}
-
-static struct ads7846_platform_data tsc2046_config __initdata = {
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-};
-
-static struct omap2_mcspi_device_config tsc2046_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static struct spi_board_info ldp_spi_board_info[] __initdata = {
-	[0] = {
-		/*
-		 * TSC2046 operates at a max freqency of 2MHz, so
-		 * operate slightly below at 1.5MHz
-		 */
-		.modalias		= "ads7846",
-		.bus_num		= 1,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &tsc2046_mcspi_config,
-		.irq			= 0,
-		.platform_data		= &tsc2046_config,
-	},
+static struct omap_smsc911x_platform_data smsc911x_cfg = {
+	.cs             = LDP_SMSC911X_CS,
+	.gpio_irq       = LDP_SMSC911X_GPIO,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT,
 };
 
 static inline void __init ldp_init_smsc911x(void)
 {
-	int eth_cs;
-	unsigned long cs_mem_base;
-	int eth_gpio = 0;
-
-	eth_cs = LDP_SMSC911X_CS;
-
-	if (gpmc_cs_request(eth_cs, SZ_16M, &cs_mem_base) < 0) {
-		printk(KERN_ERR "Failed to request GPMC mem for smsc911x\n");
-		return;
-	}
-
-	ldp_smsc911x_resources[0].start = cs_mem_base + 0x0;
-	ldp_smsc911x_resources[0].end   = cs_mem_base + 0xff;
-	udelay(100);
-
-	eth_gpio = LDP_SMSC911X_GPIO;
-
-	ldp_smsc911x_resources[1].start = OMAP_GPIO_IRQ(eth_gpio);
-
-	if (gpio_request(eth_gpio, "smsc911x irq") < 0) {
-		printk(KERN_ERR "Failed to request GPIO%d for smsc911x IRQ\n",
-				eth_gpio);
-		return;
-	}
-	gpio_direction_input(eth_gpio);
+	gpmc_smsc911x_init(&smsc911x_cfg);
 }
 
 static struct platform_device ldp_lcd_device = {
@@ -360,19 +265,9 @@ static struct twl4030_platform_data ldp_twldata = {
 	.keypad		= &ldp_kp_twl4030_data,
 };
 
-static struct i2c_board_info __initdata ldp_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &ldp_twldata,
-	},
-};
-
 static int __init omap_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2600, ldp_i2c_boardinfo,
-			ARRAY_SIZE(ldp_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &ldp_twldata);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	omap_register_i2c_bus(3, 400, NULL, 0);
 	return 0;
@@ -389,7 +284,6 @@ static struct omap2_hsmmc_info mmc[] __initdata = {
 };
 
 static struct platform_device *ldp_devices[] __initdata = {
-	&ldp_smsc911x_device,
 	&ldp_lcd_device,
 	&ldp_gpio_keys_device,
 };
@@ -400,12 +294,6 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static struct mtd_partition ldp_nand_partitions[] = {
 	/* All the partition sizes are listed in terms of NAND block size */
 	{
@@ -446,13 +334,9 @@ static void __init omap_ldp_init(void)
 	ldp_init_smsc911x();
 	omap_i2c_init();
 	platform_add_devices(ldp_devices, ARRAY_SIZE(ldp_devices));
-	ts_gpio = 54;
-	ldp_spi_board_info[0].irq = gpio_to_irq(ts_gpio);
-	spi_register_board_info(ldp_spi_board_info,
-				ARRAY_SIZE(ldp_spi_board_info));
-	ads7846_dev_init();
+	omap_ads7846_init(1, 54, 310, NULL);
 	omap_serial_init();
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	board_nand_init(ldp_nand_partitions,
 		ARRAY_SIZE(ldp_nand_partitions), ZOOM_NAND_CS, 0);
 
diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c
index e710cd9..8d74318 100644
--- a/arch/arm/mach-omap2/board-n8x0.c
+++ b/arch/arm/mach-omap2/board-n8x0.c
@@ -106,14 +106,13 @@ static void __init n8x0_usb_init(void)
 	static char	announce[] __initdata = KERN_INFO "TUSB 6010\n";
 
 	/* PM companion chip power control pin */
-	ret = gpio_request(TUSB6010_GPIO_ENABLE, "TUSB6010 enable");
+	ret = gpio_request_one(TUSB6010_GPIO_ENABLE, GPIOF_OUT_INIT_LOW,
+			       "TUSB6010 enable");
 	if (ret != 0) {
 		printk(KERN_ERR "Could not get TUSB power GPIO%i\n",
 		       TUSB6010_GPIO_ENABLE);
 		return;
 	}
-	gpio_direction_output(TUSB6010_GPIO_ENABLE, 0);
-
 	tusb_set_power(0);
 
 	ret = tusb6010_setup_interface(&tusb_data, TUSB6010_REFCLK_19, 2,
@@ -494,8 +493,12 @@ static struct omap_mmc_platform_data mmc1_data = {
 
 static struct omap_mmc_platform_data *mmc_data[OMAP24XX_NR_MMC];
 
-static void __init n8x0_mmc_init(void)
+static struct gpio n810_emmc_gpios[] __initdata = {
+	{ N810_EMMC_VSD_GPIO, GPIOF_OUT_INIT_LOW,  "MMC slot 2 Vddf" },
+	{ N810_EMMC_VIO_GPIO, GPIOF_OUT_INIT_LOW,  "MMC slot 2 Vdd"  },
+};
 
+static void __init n8x0_mmc_init(void)
 {
 	int err;
 
@@ -512,27 +515,18 @@ static void __init n8x0_mmc_init(void)
 		mmc1_data.slots[1].ban_openended = 1;
 	}
 
-	err = gpio_request(N8X0_SLOT_SWITCH_GPIO, "MMC slot switch");
+	err = gpio_request_one(N8X0_SLOT_SWITCH_GPIO, GPIOF_OUT_INIT_LOW,
+			       "MMC slot switch");
 	if (err)
 		return;
 
-	gpio_direction_output(N8X0_SLOT_SWITCH_GPIO, 0);
-
 	if (machine_is_nokia_n810()) {
-		err = gpio_request(N810_EMMC_VSD_GPIO, "MMC slot 2 Vddf");
-		if (err) {
-			gpio_free(N8X0_SLOT_SWITCH_GPIO);
-			return;
-		}
-		gpio_direction_output(N810_EMMC_VSD_GPIO, 0);
-
-		err = gpio_request(N810_EMMC_VIO_GPIO, "MMC slot 2 Vdd");
+		err = gpio_request_array(n810_emmc_gpios,
+					 ARRAY_SIZE(n810_emmc_gpios));
 		if (err) {
 			gpio_free(N8X0_SLOT_SWITCH_GPIO);
-			gpio_free(N810_EMMC_VSD_GPIO);
 			return;
 		}
-		gpio_direction_output(N810_EMMC_VIO_GPIO, 0);
 	}
 
 	mmc_data[0] = &mmc1_data;
diff --git a/arch/arm/mach-omap2/board-omap3beagle.c b/arch/arm/mach-omap2/board-omap3beagle.c
index 97750d4..be71426 100644
--- a/arch/arm/mach-omap2/board-omap3beagle.c
+++ b/arch/arm/mach-omap2/board-omap3beagle.c
@@ -52,6 +52,7 @@
 #include "hsmmc.h"
 #include "timer-gp.h"
 #include "pm.h"
+#include "common-board-devices.h"
 
 #define NAND_BLOCK_SIZE		SZ_128K
 
@@ -79,6 +80,12 @@ static u8 omap3_beagle_get_rev(void)
 	return omap3_beagle_version;
 }
 
+static struct gpio omap3_beagle_rev_gpios[] __initdata = {
+	{ 171, GPIOF_IN, "rev_id_0"    },
+	{ 172, GPIOF_IN, "rev_id_1" },
+	{ 173, GPIOF_IN, "rev_id_2"    },
+};
+
 static void __init omap3_beagle_init_rev(void)
 {
 	int ret;
@@ -88,21 +95,13 @@ static void __init omap3_beagle_init_rev(void)
 	omap_mux_init_gpio(172, OMAP_PIN_INPUT_PULLUP);
 	omap_mux_init_gpio(173, OMAP_PIN_INPUT_PULLUP);
 
-	ret = gpio_request(171, "rev_id_0");
-	if (ret < 0)
-		goto fail0;
-
-	ret = gpio_request(172, "rev_id_1");
-	if (ret < 0)
-		goto fail1;
-
-	ret = gpio_request(173, "rev_id_2");
-	if (ret < 0)
-		goto fail2;
-
-	gpio_direction_input(171);
-	gpio_direction_input(172);
-	gpio_direction_input(173);
+	ret = gpio_request_array(omap3_beagle_rev_gpios,
+				 ARRAY_SIZE(omap3_beagle_rev_gpios));
+	if (ret < 0) {
+		printk(KERN_ERR "Unable to get revision detection GPIO pins\n");
+		omap3_beagle_version = OMAP3BEAGLE_BOARD_UNKN;
+		return;
+	}
 
 	beagle_rev = gpio_get_value(171) | (gpio_get_value(172) << 1)
 			| (gpio_get_value(173) << 2);
@@ -128,18 +127,6 @@ static void __init omap3_beagle_init_rev(void)
 		printk(KERN_INFO "OMAP3 Beagle Rev: unknown %hd\n", beagle_rev);
 		omap3_beagle_version = OMAP3BEAGLE_BOARD_UNKN;
 	}
-
-	return;
-
-fail2:
-	gpio_free(172);
-fail1:
-	gpio_free(171);
-fail0:
-	printk(KERN_ERR "Unable to get revision detection GPIO pins\n");
-	omap3_beagle_version = OMAP3BEAGLE_BOARD_UNKN;
-
-	return;
 }
 
 static struct mtd_partition omap3beagle_nand_partitions[] = {
@@ -173,15 +160,6 @@ static struct mtd_partition omap3beagle_nand_partitions[] = {
 	},
 };
 
-static struct omap_nand_platform_data omap3beagle_nand_data = {
-	.options	= NAND_BUSWIDTH_16,
-	.parts		= omap3beagle_nand_partitions,
-	.nr_parts	= ARRAY_SIZE(omap3beagle_nand_partitions),
-	.dma_channel	= -1,		/* disable DMA in OMAP NAND driver */
-	.nand_setup	= NULL,
-	.dev_ready	= NULL,
-};
-
 /* DSS */
 
 static int beagle_enable_dvi(struct omap_dss_device *dssdev)
@@ -243,13 +221,10 @@ static void __init beagle_display_init(void)
 {
 	int r;
 
-	r = gpio_request(beagle_dvi_device.reset_gpio, "DVI reset");
-	if (r < 0) {
+	r = gpio_request_one(beagle_dvi_device.reset_gpio, GPIOF_OUT_INIT_LOW,
+			     "DVI reset");
+	if (r < 0)
 		printk(KERN_ERR "Unable to get DVI reset GPIO\n");
-		return;
-	}
-
-	gpio_direction_output(beagle_dvi_device.reset_gpio, 0);
 }
 
 #include "sdram-micron-mt46h32m32lf-6.h"
@@ -276,7 +251,7 @@ static struct gpio_led gpio_leds[];
 static int beagle_twl_gpio_setup(struct device *dev,
 		unsigned gpio, unsigned ngpio)
 {
-	int r;
+	int r, usb_pwr_level;
 
 	if (omap3_beagle_get_rev() == OMAP3BEAGLE_BOARD_XM) {
 		mmc[0].gpio_wp = -EINVAL;
@@ -295,66 +270,46 @@ static int beagle_twl_gpio_setup(struct device *dev,
 	beagle_vmmc1_supply.dev = mmc[0].dev;
 	beagle_vsim_supply.dev = mmc[0].dev;
 
-	/* REVISIT: need ehci-omap hooks for external VBUS
-	 * power switch and overcurrent detect
-	 */
-	if (omap3_beagle_get_rev() != OMAP3BEAGLE_BOARD_XM) {
-		r = gpio_request(gpio + 1, "EHCI_nOC");
-		if (!r) {
-			r = gpio_direction_input(gpio + 1);
-			if (r)
-				gpio_free(gpio + 1);
-		}
-		if (r)
-			pr_err("%s: unable to configure EHCI_nOC\n", __func__);
-	}
-
 	/*
 	 * TWL4030_GPIO_MAX + 0 == ledA, EHCI nEN_USB_PWR (out, XM active
 	 * high / others active low)
-	 */
-	gpio_request(gpio + TWL4030_GPIO_MAX, "nEN_USB_PWR");
-	if (omap3_beagle_get_rev() == OMAP3BEAGLE_BOARD_XM)
-		gpio_direction_output(gpio + TWL4030_GPIO_MAX, 1);
-	else
-		gpio_direction_output(gpio + TWL4030_GPIO_MAX, 0);
-
-	/* DVI reset GPIO is different between beagle revisions */
-	if (omap3_beagle_get_rev() == OMAP3BEAGLE_BOARD_XM)
-		beagle_dvi_device.reset_gpio = 129;
-	else
-		beagle_dvi_device.reset_gpio = 170;
-
-	/* TWL4030_GPIO_MAX + 1 == ledB, PMU_STAT (out, active low LED) */
-	gpio_leds[2].gpio = gpio + TWL4030_GPIO_MAX + 1;
-
-	/*
-	 * gpio + 1 on Xm controls the TFP410's enable line (active low)
-	 * gpio + 2 control varies depending on the board rev as follows:
-	 * P7/P8 revisions(prototype): Camera EN
-	 * A2+ revisions (production): LDO (supplies DVI, serial, led blocks)
+	 * DVI reset GPIO is different between beagle revisions
 	 */
 	if (omap3_beagle_get_rev() == OMAP3BEAGLE_BOARD_XM) {
-		r = gpio_request(gpio + 1, "nDVI_PWR_EN");
-		if (!r) {
-			r = gpio_direction_output(gpio + 1, 0);
-			if (r)
-				gpio_free(gpio + 1);
-		}
+		usb_pwr_level = GPIOF_OUT_INIT_HIGH;
+		beagle_dvi_device.reset_gpio = 129;
+		/*
+		 * gpio + 1 on Xm controls the TFP410's enable line (active low)
+		 * gpio + 2 control varies depending on the board rev as below:
+		 * P7/P8 revisions(prototype): Camera EN
+		 * A2+ revisions (production): LDO (DVI, serial, led blocks)
+		 */
+		r = gpio_request_one(gpio + 1, GPIOF_OUT_INIT_LOW,
+				     "nDVI_PWR_EN");
 		if (r)
 			pr_err("%s: unable to configure nDVI_PWR_EN\n",
 				__func__);
-		r = gpio_request(gpio + 2, "DVI_LDO_EN");
-		if (!r) {
-			r = gpio_direction_output(gpio + 2, 1);
-			if (r)
-				gpio_free(gpio + 2);
-		}
+		r = gpio_request_one(gpio + 2, GPIOF_OUT_INIT_HIGH,
+				     "DVI_LDO_EN");
 		if (r)
 			pr_err("%s: unable to configure DVI_LDO_EN\n",
 				__func__);
+	} else {
+		usb_pwr_level = GPIOF_OUT_INIT_LOW;
+		beagle_dvi_device.reset_gpio = 170;
+		/*
+		 * REVISIT: need ehci-omap hooks for external VBUS
+		 * power switch and overcurrent detect
+		 */
+		if (gpio_request_one(gpio + 1, GPIOF_IN, "EHCI_nOC"))
+			pr_err("%s: unable to configure EHCI_nOC\n", __func__);
 	}
 
+	gpio_request_one(gpio + TWL4030_GPIO_MAX, usb_pwr_level, "nEN_USB_PWR");
+
+	/* TWL4030_GPIO_MAX + 1 == ledB, PMU_STAT (out, active low LED) */
+	gpio_leds[2].gpio = gpio + TWL4030_GPIO_MAX + 1;
+
 	return 0;
 }
 
@@ -453,15 +408,6 @@ static struct twl4030_platform_data beagle_twldata = {
 	.vpll2		= &beagle_vpll2,
 };
 
-static struct i2c_board_info __initdata beagle_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &beagle_twldata,
-	},
-};
-
 static struct i2c_board_info __initdata beagle_i2c_eeprom[] = {
        {
                I2C_BOARD_INFO("eeprom", 0x50),
@@ -470,8 +416,7 @@ static struct i2c_board_info __initdata beagle_i2c_eeprom[] = {
 
 static int __init omap3_beagle_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2600, beagle_i2c_boardinfo,
-			ARRAY_SIZE(beagle_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &beagle_twldata);
 	/* Bus 3 is attached to the DVI port where devices like the pico DLP
 	 * projector don't work reliably with 400kHz */
 	omap_register_i2c_bus(3, 100, beagle_i2c_eeprom, ARRAY_SIZE(beagle_i2c_eeprom));
@@ -551,39 +496,6 @@ static struct platform_device *omap3_beagle_devices[] __initdata = {
 	&keys_gpio,
 };
 
-static void __init omap3beagle_flash_init(void)
-{
-	u8 cs = 0;
-	u8 nandcs = GPMC_CS_NUM + 1;
-
-	/* find out the chip-select on which NAND exists */
-	while (cs < GPMC_CS_NUM) {
-		u32 ret = 0;
-		ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG1);
-
-		if ((ret & 0xC00) == 0x800) {
-			printk(KERN_INFO "Found NAND on CS%d\n", cs);
-			if (nandcs > GPMC_CS_NUM)
-				nandcs = cs;
-		}
-		cs++;
-	}
-
-	if (nandcs > GPMC_CS_NUM) {
-		printk(KERN_INFO "NAND: Unable to find configuration "
-				 "in GPMC\n ");
-		return;
-	}
-
-	if (nandcs < GPMC_CS_NUM) {
-		omap3beagle_nand_data.cs = nandcs;
-
-		printk(KERN_INFO "Registering NAND on CS%d\n", nandcs);
-		if (gpmc_nand_init(&omap3beagle_nand_data) < 0)
-			printk(KERN_ERR "Unable to register NAND device\n");
-	}
-}
-
 static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
 
 	.port_mode[0] = OMAP_EHCI_PORT_MODE_PHY,
@@ -602,12 +514,6 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static void __init beagle_opp_init(void)
 {
 	int r = 0;
@@ -665,13 +571,13 @@ static void __init omap3_beagle_init(void)
 	omap_serial_init();
 
 	omap_mux_init_gpio(170, OMAP_PIN_INPUT);
-	gpio_request(170, "DVI_nPD");
 	/* REVISIT leave DVI powered down until it's needed ... */
-	gpio_direction_output(170, true);
+	gpio_request_one(170, GPIOF_OUT_INIT_HIGH, "DVI_nPD");
 
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	usbhs_init(&usbhs_bdata);
-	omap3beagle_flash_init();
+	omap_nand_flash_init(NAND_BUSWIDTH_16, omap3beagle_nand_partitions,
+			     ARRAY_SIZE(omap3beagle_nand_partitions));
 
 	/* Ensure SDRC pins are mux'd for self-refresh */
 	omap_mux_init_signal("sdrc_cke0", OMAP_PIN_OUTPUT);
diff --git a/arch/arm/mach-omap2/board-omap3evm.c b/arch/arm/mach-omap2/board-omap3evm.c
index 7f94ccc..b4d4346 100644
--- a/arch/arm/mach-omap2/board-omap3evm.c
+++ b/arch/arm/mach-omap2/board-omap3evm.c
@@ -50,6 +50,7 @@
 #include "mux.h"
 #include "sdram-micron-mt46h32m32lf-6.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define OMAP3_EVM_TS_GPIO	175
 #define OMAP3_EVM_EHCI_VBUS	22
@@ -101,49 +102,20 @@ static void __init omap3_evm_get_revision(void)
 }
 
 #if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
-static struct resource omap3evm_smsc911x_resources[] = {
-	[0] =	{
-		.start	= OMAP3EVM_ETHR_START,
-		.end	= (OMAP3EVM_ETHR_START + OMAP3EVM_ETHR_SIZE - 1),
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] =	{
-		.start	= OMAP_GPIO_IRQ(OMAP3EVM_ETHR_GPIO_IRQ),
-		.end	= OMAP_GPIO_IRQ(OMAP3EVM_ETHR_GPIO_IRQ),
-		.flags	= (IORESOURCE_IRQ | IRQF_TRIGGER_LOW),
-	},
-};
+#include <plat/gpmc-smsc911x.h>
 
-static struct smsc911x_platform_config smsc911x_config = {
-	.phy_interface  = PHY_INTERFACE_MODE_MII,
-	.irq_polarity   = SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type       = SMSC911X_IRQ_TYPE_OPEN_DRAIN,
-	.flags          = (SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS),
-};
-
-static struct platform_device omap3evm_smsc911x_device = {
-	.name		= "smsc911x",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(omap3evm_smsc911x_resources),
-	.resource	= &omap3evm_smsc911x_resources[0],
-	.dev		= {
-		.platform_data = &smsc911x_config,
-	},
+static struct omap_smsc911x_platform_data smsc911x_cfg = {
+	.cs             = OMAP3EVM_SMSC911X_CS,
+	.gpio_irq       = OMAP3EVM_ETHR_GPIO_IRQ,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS,
 };
 
 static inline void __init omap3evm_init_smsc911x(void)
 {
-	int eth_cs, eth_rst;
 	struct clk *l3ck;
 	unsigned int rate;
 
-	if (get_omap3_evm_rev() == OMAP3EVM_BOARD_GEN_1)
-		eth_rst = OMAP3EVM_GEN1_ETHR_GPIO_RST;
-	else
-		eth_rst = OMAP3EVM_GEN2_ETHR_GPIO_RST;
-
-	eth_cs = OMAP3EVM_SMSC911X_CS;
-
 	l3ck = clk_get(NULL, "l3_ck");
 	if (IS_ERR(l3ck))
 		rate = 100000000;
@@ -152,33 +124,13 @@ static inline void __init omap3evm_init_smsc911x(void)
 
 	/* Configure ethernet controller reset gpio */
 	if (cpu_is_omap3430()) {
-		if (gpio_request(eth_rst, "SMSC911x gpio") < 0) {
-			pr_err(KERN_ERR "Failed to request %d for smsc911x\n",
-					eth_rst);
-			return;
-		}
-
-		if (gpio_direction_output(eth_rst, 1) < 0) {
-			pr_err(KERN_ERR "Failed to set direction of %d for" \
-					" smsc911x\n", eth_rst);
-			return;
-		}
-		/* reset pulse to ethernet controller*/
-		usleep_range(150, 220);
-		gpio_set_value(eth_rst, 0);
-		usleep_range(150, 220);
-		gpio_set_value(eth_rst, 1);
-		usleep_range(1, 2);
-	}
-
-	if (gpio_request(OMAP3EVM_ETHR_GPIO_IRQ, "SMSC911x irq") < 0) {
-		printk(KERN_ERR "Failed to request GPIO%d for smsc911x IRQ\n",
-			OMAP3EVM_ETHR_GPIO_IRQ);
-		return;
+		if (get_omap3_evm_rev() == OMAP3EVM_BOARD_GEN_1)
+			smsc911x_cfg.gpio_reset = OMAP3EVM_GEN1_ETHR_GPIO_RST;
+		else
+			smsc911x_cfg.gpio_reset = OMAP3EVM_GEN2_ETHR_GPIO_RST;
 	}
 
-	gpio_direction_input(OMAP3EVM_ETHR_GPIO_IRQ);
-	platform_device_register(&omap3evm_smsc911x_device);
+	gpmc_smsc911x_init(&smsc911x_cfg);
 }
 
 #else
@@ -197,6 +149,15 @@ static inline void __init omap3evm_init_smsc911x(void) { return; }
 #define OMAP3EVM_LCD_PANEL_BKLIGHT_GPIO	210
 #define OMAP3EVM_DVI_PANEL_EN_GPIO	199
 
+static struct gpio omap3_evm_dss_gpios[] __initdata = {
+	{ OMAP3EVM_LCD_PANEL_RESB,  GPIOF_OUT_INIT_HIGH, "lcd_panel_resb"  },
+	{ OMAP3EVM_LCD_PANEL_INI,   GPIOF_OUT_INIT_HIGH, "lcd_panel_ini"   },
+	{ OMAP3EVM_LCD_PANEL_QVGA,  GPIOF_OUT_INIT_LOW,  "lcd_panel_qvga"  },
+	{ OMAP3EVM_LCD_PANEL_LR,    GPIOF_OUT_INIT_HIGH, "lcd_panel_lr"    },
+	{ OMAP3EVM_LCD_PANEL_UD,    GPIOF_OUT_INIT_HIGH, "lcd_panel_ud"    },
+	{ OMAP3EVM_LCD_PANEL_ENVDD, GPIOF_OUT_INIT_LOW,  "lcd_panel_envdd" },
+};
+
 static int lcd_enabled;
 static int dvi_enabled;
 
@@ -204,61 +165,10 @@ static void __init omap3_evm_display_init(void)
 {
 	int r;
 
-	r = gpio_request(OMAP3EVM_LCD_PANEL_RESB, "lcd_panel_resb");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_resb\n");
-		return;
-	}
-	gpio_direction_output(OMAP3EVM_LCD_PANEL_RESB, 1);
-
-	r = gpio_request(OMAP3EVM_LCD_PANEL_INI, "lcd_panel_ini");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_ini\n");
-		goto err_1;
-	}
-	gpio_direction_output(OMAP3EVM_LCD_PANEL_INI, 1);
-
-	r = gpio_request(OMAP3EVM_LCD_PANEL_QVGA, "lcd_panel_qvga");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_qvga\n");
-		goto err_2;
-	}
-	gpio_direction_output(OMAP3EVM_LCD_PANEL_QVGA, 0);
-
-	r = gpio_request(OMAP3EVM_LCD_PANEL_LR, "lcd_panel_lr");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_lr\n");
-		goto err_3;
-	}
-	gpio_direction_output(OMAP3EVM_LCD_PANEL_LR, 1);
-
-	r = gpio_request(OMAP3EVM_LCD_PANEL_UD, "lcd_panel_ud");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_ud\n");
-		goto err_4;
-	}
-	gpio_direction_output(OMAP3EVM_LCD_PANEL_UD, 1);
-
-	r = gpio_request(OMAP3EVM_LCD_PANEL_ENVDD, "lcd_panel_envdd");
-	if (r) {
-		printk(KERN_ERR "failed to get lcd_panel_envdd\n");
-		goto err_5;
-	}
-	gpio_direction_output(OMAP3EVM_LCD_PANEL_ENVDD, 0);
-
-	return;
-
-err_5:
-	gpio_free(OMAP3EVM_LCD_PANEL_UD);
-err_4:
-	gpio_free(OMAP3EVM_LCD_PANEL_LR);
-err_3:
-	gpio_free(OMAP3EVM_LCD_PANEL_QVGA);
-err_2:
-	gpio_free(OMAP3EVM_LCD_PANEL_INI);
-err_1:
-	gpio_free(OMAP3EVM_LCD_PANEL_RESB);
-
+	r = gpio_request_array(omap3_evm_dss_gpios,
+			       ARRAY_SIZE(omap3_evm_dss_gpios));
+	if (r)
+		printk(KERN_ERR "failed to get lcd_panel_* gpios\n");
 }
 
 static int omap3_evm_enable_lcd(struct omap_dss_device *dssdev)
@@ -448,7 +358,7 @@ static struct platform_device leds_gpio = {
 static int omap3evm_twl_gpio_setup(struct device *dev,
 		unsigned gpio, unsigned ngpio)
 {
-	int r;
+	int r, lcd_bl_en;
 
 	/* gpio + 0 is "mmc0_cd" (input/IRQ) */
 	omap_mux_init_gpio(63, OMAP_PIN_INPUT);
@@ -465,16 +375,14 @@ static int omap3evm_twl_gpio_setup(struct device *dev,
 	 */
 
 	/* TWL4030_GPIO_MAX + 0 == ledA, LCD Backlight control */
-	r = gpio_request(gpio + TWL4030_GPIO_MAX, "EN_LCD_BKL");
-	if (!r)
-		r = gpio_direction_output(gpio + TWL4030_GPIO_MAX,
-			(get_omap3_evm_rev() >= OMAP3EVM_BOARD_GEN_2) ? 1 : 0);
+	lcd_bl_en = get_omap3_evm_rev() >= OMAP3EVM_BOARD_GEN_2 ?
+		GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
+	r = gpio_request_one(gpio + TWL4030_GPIO_MAX, lcd_bl_en, "EN_LCD_BKL");
 	if (r)
 		printk(KERN_ERR "failed to get/set lcd_bkl gpio\n");
 
 	/* gpio + 7 == DVI Enable */
-	gpio_request(gpio + 7, "EN_DVI");
-	gpio_direction_output(gpio + 7, 0);
+	gpio_request_one(gpio + 7, GPIOF_OUT_INIT_LOW, "EN_DVI");
 
 	/* TWL4030_GPIO_MAX + 1 == ledB (out, active low LED) */
 	gpio_leds[2].gpio = gpio + TWL4030_GPIO_MAX + 1;
@@ -652,78 +560,18 @@ static struct twl4030_platform_data omap3evm_twldata = {
 	.vdac		= &omap3_evm_vdac,
 	.vpll2		= &omap3_evm_vpll2,
 	.vio		= &omap3evm_vio,
-};
-
-static struct i2c_board_info __initdata omap3evm_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &omap3evm_twldata,
-	},
+	.vmmc1		= &omap3evm_vmmc1,
+	.vsim		= &omap3evm_vsim,
 };
 
 static int __init omap3_evm_i2c_init(void)
 {
-	/*
-	 * REVISIT: These entries can be set in omap3evm_twl_data
-	 * after a merge with MFD tree
-	 */
-	omap3evm_twldata.vmmc1 = &omap3evm_vmmc1;
-	omap3evm_twldata.vsim = &omap3evm_vsim;
-
-	omap_register_i2c_bus(1, 2600, omap3evm_i2c_boardinfo,
-			ARRAY_SIZE(omap3evm_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &omap3evm_twldata);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	omap_register_i2c_bus(3, 400, NULL, 0);
 	return 0;
 }
 
-static void ads7846_dev_init(void)
-{
-	if (gpio_request(OMAP3_EVM_TS_GPIO, "ADS7846 pendown") < 0)
-		printk(KERN_ERR "can't get ads7846 pen down GPIO\n");
-
-	gpio_direction_input(OMAP3_EVM_TS_GPIO);
-	gpio_set_debounce(OMAP3_EVM_TS_GPIO, 310);
-}
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(OMAP3_EVM_TS_GPIO);
-}
-
-static struct ads7846_platform_data ads7846_config = {
-	.x_max			= 0x0fff,
-	.y_max			= 0x0fff,
-	.x_plate_ohms		= 180,
-	.pressure_max		= 255,
-	.debounce_max		= 10,
-	.debounce_tol		= 3,
-	.debounce_rep		= 1,
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-	.settle_delay_usecs	= 150,
-	.wakeup				= true,
-};
-
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static struct spi_board_info omap3evm_spi_board_info[] = {
-	[0] = {
-		.modalias		= "ads7846",
-		.bus_num		= 1,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &ads7846_mcspi_config,
-		.irq			= OMAP_GPIO_IRQ(OMAP3_EVM_TS_GPIO),
-		.platform_data		= &ads7846_config,
-	},
-};
-
 static struct omap_board_config_kernel omap3_evm_config[] __initdata = {
 };
 
@@ -825,6 +673,11 @@ static struct omap_musb_board_data musb_board_data = {
 	.power			= 100,
 };
 
+static struct gpio omap3_evm_ehci_gpios[] __initdata = {
+	{ OMAP3_EVM_EHCI_VBUS,	 GPIOF_OUT_INIT_HIGH,  "enable EHCI VBUS" },
+	{ OMAP3_EVM_EHCI_SELECT, GPIOF_OUT_INIT_LOW,   "select EHCI port" },
+};
+
 static void __init omap3_evm_init(void)
 {
 	omap3_evm_get_revision();
@@ -841,9 +694,6 @@ static void __init omap3_evm_init(void)
 
 	omap_display_init(&omap3_evm_dss_data);
 
-	spi_register_board_info(omap3evm_spi_board_info,
-				ARRAY_SIZE(omap3evm_spi_board_info));
-
 	omap_serial_init();
 
 	/* OMAP3EVM uses ISP1504 phy and so register nop transceiver */
@@ -851,16 +701,12 @@ static void __init omap3_evm_init(void)
 
 	if (get_omap3_evm_rev() >= OMAP3EVM_BOARD_GEN_2) {
 		/* enable EHCI VBUS using GPIO22 */
-		omap_mux_init_gpio(22, OMAP_PIN_INPUT_PULLUP);
-		gpio_request(OMAP3_EVM_EHCI_VBUS, "enable EHCI VBUS");
-		gpio_direction_output(OMAP3_EVM_EHCI_VBUS, 0);
-		gpio_set_value(OMAP3_EVM_EHCI_VBUS, 1);
-
+		omap_mux_init_gpio(OMAP3_EVM_EHCI_VBUS, OMAP_PIN_INPUT_PULLUP);
 		/* Select EHCI port on main board */
-		omap_mux_init_gpio(61, OMAP_PIN_INPUT_PULLUP);
-		gpio_request(OMAP3_EVM_EHCI_SELECT, "select EHCI port");
-		gpio_direction_output(OMAP3_EVM_EHCI_SELECT, 0);
-		gpio_set_value(OMAP3_EVM_EHCI_SELECT, 0);
+		omap_mux_init_gpio(OMAP3_EVM_EHCI_SELECT,
+				   OMAP_PIN_INPUT_PULLUP);
+		gpio_request_array(omap3_evm_ehci_gpios,
+				   ARRAY_SIZE(omap3_evm_ehci_gpios));
 
 		/* setup EHCI phy reset config */
 		omap_mux_init_gpio(21, OMAP_PIN_INPUT_PULLUP);
@@ -876,7 +722,7 @@ static void __init omap3_evm_init(void)
 	}
 	usb_musb_init(&musb_board_data);
 	usbhs_init(&usbhs_bdata);
-	ads7846_dev_init();
+	omap_ads7846_init(1, OMAP3_EVM_TS_GPIO, 310, NULL);
 	omap3evm_init_smsc911x();
 	omap3_evm_display_init();
 
diff --git a/arch/arm/mach-omap2/board-omap3logic.c b/arch/arm/mach-omap2/board-omap3logic.c
index b726943..60d9be4 100644
--- a/arch/arm/mach-omap2/board-omap3logic.c
+++ b/arch/arm/mach-omap2/board-omap3logic.c
@@ -37,6 +37,7 @@
 #include "hsmmc.h"
 #include "timer-gp.h"
 #include "control.h"
+#include "common-board-devices.h"
 
 #include <plat/mux.h>
 #include <plat/board.h>
@@ -93,19 +94,9 @@ static struct twl4030_platform_data omap3logic_twldata = {
 	.vmmc1		= &omap3logic_vmmc1,
 };
 
-static struct i2c_board_info __initdata omap3logic_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &omap3logic_twldata,
-	},
-};
-
 static int __init omap3logic_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2600, omap3logic_i2c_boardinfo,
-				ARRAY_SIZE(omap3logic_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &omap3logic_twldata);
 	return 0;
 }
 
@@ -147,7 +138,6 @@ static struct omap_smsc911x_platform_data __initdata board_smsc911x_data = {
 	.cs             = OMAP3LOGIC_SMSC911X_CS,
 	.gpio_irq       = -EINVAL,
 	.gpio_reset     = -EINVAL,
-	.flags          = IORESOURCE_IRQ_LOWLEVEL,
 };
 
 /* TODO/FIXME (comment by Peter Barada, LogicPD):
diff --git a/arch/arm/mach-omap2/board-omap3pandora.c b/arch/arm/mach-omap2/board-omap3pandora.c
index 1db1549..1d10736 100644
--- a/arch/arm/mach-omap2/board-omap3pandora.c
+++ b/arch/arm/mach-omap2/board-omap3pandora.c
@@ -22,7 +22,6 @@
 #include <linux/platform_device.h>
 
 #include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
 #include <linux/regulator/machine.h>
 #include <linux/i2c/twl.h>
 #include <linux/wl12xx.h>
@@ -52,6 +51,7 @@
 #include "mux.h"
 #include "sdram-micron-mt46h32m32lf-6.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define PANDORA_WIFI_IRQ_GPIO		21
 #define PANDORA_WIFI_NRESET_GPIO	23
@@ -305,24 +305,13 @@ static int omap3pandora_twl_gpio_setup(struct device *dev,
 
 	/* gpio + 13 drives 32kHz buffer for wifi module */
 	gpio_32khz = gpio + 13;
-	ret = gpio_request(gpio_32khz, "wifi 32kHz");
+	ret = gpio_request_one(gpio_32khz, GPIOF_OUT_INIT_HIGH, "wifi 32kHz");
 	if (ret < 0) {
 		pr_err("Cannot get GPIO line %d, ret=%d\n", gpio_32khz, ret);
-		goto fail;
-	}
-
-	ret = gpio_direction_output(gpio_32khz, 1);
-	if (ret < 0) {
-		pr_err("Cannot set GPIO line %d, ret=%d\n", gpio_32khz, ret);
-		goto fail_direction;
+		return -ENODEV;
 	}
 
 	return 0;
-
-fail_direction:
-	gpio_free(gpio_32khz);
-fail:
-	return -ENODEV;
 }
 
 static struct twl4030_gpio_platform_data omap3pandora_gpio_data = {
@@ -544,15 +533,6 @@ static struct twl4030_platform_data omap3pandora_twldata = {
 	.bci		= &pandora_bci_data,
 };
 
-static struct i2c_board_info __initdata omap3pandora_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("tps65950", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &omap3pandora_twldata,
-	},
-};
-
 static struct i2c_board_info __initdata omap3pandora_i2c3_boardinfo[] = {
 	{
 		I2C_BOARD_INFO("bq27500", 0x55),
@@ -562,61 +542,15 @@ static struct i2c_board_info __initdata omap3pandora_i2c3_boardinfo[] = {
 
 static int __init omap3pandora_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2600, omap3pandora_i2c_boardinfo,
-			ARRAY_SIZE(omap3pandora_i2c_boardinfo));
+	omap3_pmic_init("tps65950", &omap3pandora_twldata);
 	/* i2c2 pins are not connected */
 	omap_register_i2c_bus(3, 100, omap3pandora_i2c3_boardinfo,
 			ARRAY_SIZE(omap3pandora_i2c3_boardinfo));
 	return 0;
 }
 
-static void __init omap3pandora_ads7846_init(void)
-{
-	int gpio = OMAP3_PANDORA_TS_GPIO;
-	int ret;
-
-	ret = gpio_request(gpio, "ads7846_pen_down");
-	if (ret < 0) {
-		printk(KERN_ERR "Failed to request GPIO %d for "
-				"ads7846 pen down IRQ\n", gpio);
-		return;
-	}
-
-	gpio_direction_input(gpio);
-}
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(OMAP3_PANDORA_TS_GPIO);
-}
-
-static struct ads7846_platform_data ads7846_config = {
-	.x_max			= 0x0fff,
-	.y_max			= 0x0fff,
-	.x_plate_ohms		= 180,
-	.pressure_max		= 255,
-	.debounce_max		= 10,
-	.debounce_tol		= 3,
-	.debounce_rep		= 1,
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-};
-
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
 static struct spi_board_info omap3pandora_spi_board_info[] __initdata = {
 	{
-		.modalias		= "ads7846",
-		.bus_num		= 1,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &ads7846_mcspi_config,
-		.irq			= OMAP_GPIO_IRQ(OMAP3_PANDORA_TS_GPIO),
-		.platform_data		= &ads7846_config,
-	}, {
 		.modalias		= "tpo_td043mtea1_panel_spi",
 		.bus_num		= 1,
 		.chip_select		= 1,
@@ -639,14 +573,10 @@ static void __init pandora_wl1251_init(void)
 
 	memset(&pandora_wl1251_pdata, 0, sizeof(pandora_wl1251_pdata));
 
-	ret = gpio_request(PANDORA_WIFI_IRQ_GPIO, "wl1251 irq");
+	ret = gpio_request_one(PANDORA_WIFI_IRQ_GPIO, GPIOF_IN, "wl1251 irq");
 	if (ret < 0)
 		goto fail;
 
-	ret = gpio_direction_input(PANDORA_WIFI_IRQ_GPIO);
-	if (ret < 0)
-		goto fail_irq;
-
 	pandora_wl1251_pdata.irq = gpio_to_irq(PANDORA_WIFI_IRQ_GPIO);
 	if (pandora_wl1251_pdata.irq < 0)
 		goto fail_irq;
@@ -688,12 +618,6 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static void __init omap3pandora_init(void)
 {
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
@@ -705,9 +629,9 @@ static void __init omap3pandora_init(void)
 	omap_serial_init();
 	spi_register_board_info(omap3pandora_spi_board_info,
 			ARRAY_SIZE(omap3pandora_spi_board_info));
-	omap3pandora_ads7846_init();
+	omap_ads7846_init(1, OMAP3_PANDORA_TS_GPIO, 0, NULL);
 	usbhs_init(&usbhs_bdata);
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	gpmc_nand_init(&pandora_nand_data);
 
 	/* Ensure SDRC pins are mux'd for self-refresh */
diff --git a/arch/arm/mach-omap2/board-omap3stalker.c b/arch/arm/mach-omap2/board-omap3stalker.c
index a72c90a..0c108a2 100644
--- a/arch/arm/mach-omap2/board-omap3stalker.c
+++ b/arch/arm/mach-omap2/board-omap3stalker.c
@@ -45,7 +45,6 @@
 #include <plat/mcspi.h>
 #include <linux/input/matrix_keypad.h>
 #include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
 #include <linux/interrupt.h>
 #include <linux/smsc911x.h>
 #include <linux/i2c/at24.h>
@@ -54,52 +53,28 @@
 #include "mux.h"
 #include "hsmmc.h"
 #include "timer-gp.h"
+#include "common-board-devices.h"
 
 #if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
+#include <plat/gpmc-smsc911x.h>
+
 #define OMAP3STALKER_ETHR_START	0x2c000000
 #define OMAP3STALKER_ETHR_SIZE	1024
 #define OMAP3STALKER_ETHR_GPIO_IRQ	19
 #define OMAP3STALKER_SMC911X_CS	5
 
-static struct resource omap3stalker_smsc911x_resources[] = {
-	[0] = {
-	       .start	= OMAP3STALKER_ETHR_START,
-	       .end	=
-	       (OMAP3STALKER_ETHR_START + OMAP3STALKER_ETHR_SIZE - 1),
-	       .flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-	       .start	= OMAP_GPIO_IRQ(OMAP3STALKER_ETHR_GPIO_IRQ),
-	       .end	= OMAP_GPIO_IRQ(OMAP3STALKER_ETHR_GPIO_IRQ),
-	       .flags	= (IORESOURCE_IRQ | IRQF_TRIGGER_LOW),
-	},
-};
-
-static struct smsc911x_platform_config smsc911x_config = {
-	.phy_interface	= PHY_INTERFACE_MODE_MII,
-	.irq_polarity	= SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type	= SMSC911X_IRQ_TYPE_OPEN_DRAIN,
+static struct omap_smsc911x_platform_data smsc911x_cfg = {
+	.cs             = OMAP3STALKER_SMC911X_CS,
+	.gpio_irq       = OMAP3STALKER_ETHR_GPIO_IRQ,
+	.gpio_reset     = -EINVAL,
 	.flags		= (SMSC911X_USE_32BIT | SMSC911X_SAVE_MAC_ADDRESS),
 };
 
-static struct platform_device omap3stalker_smsc911x_device = {
-	.name		= "smsc911x",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(omap3stalker_smsc911x_resources),
-	.resource	= &omap3stalker_smsc911x_resources[0],
-	.dev		= {
-		.platform_data	= &smsc911x_config,
-	},
-};
-
 static inline void __init omap3stalker_init_eth(void)
 {
-	int eth_cs;
 	struct clk *l3ck;
 	unsigned int rate;
 
-	eth_cs = OMAP3STALKER_SMC911X_CS;
-
 	l3ck = clk_get(NULL, "l3_ck");
 	if (IS_ERR(l3ck))
 		rate = 100000000;
@@ -107,16 +82,7 @@ static inline void __init omap3stalker_init_eth(void)
 		rate = clk_get_rate(l3ck);
 
 	omap_mux_init_gpio(19, OMAP_PIN_INPUT_PULLUP);
-	if (gpio_request(OMAP3STALKER_ETHR_GPIO_IRQ, "SMC911x irq") < 0) {
-		printk(KERN_ERR
-		       "Failed to request GPIO%d for smc911x IRQ\n",
-		       OMAP3STALKER_ETHR_GPIO_IRQ);
-		return;
-	}
-
-	gpio_direction_input(OMAP3STALKER_ETHR_GPIO_IRQ);
-
-	platform_device_register(&omap3stalker_smsc911x_device);
+	gpmc_smsc911x_init(&smsc911x_cfg);
 }
 
 #else
@@ -365,12 +331,11 @@ omap3stalker_twl_gpio_setup(struct device *dev,
 	 */
 
 	/* TWL4030_GPIO_MAX + 0 == ledA, LCD Backlight control */
-	gpio_request(gpio + TWL4030_GPIO_MAX, "EN_LCD_BKL");
-	gpio_direction_output(gpio + TWL4030_GPIO_MAX, 0);
+	gpio_request_one(gpio + TWL4030_GPIO_MAX, GPIOF_OUT_INIT_LOW,
+			 "EN_LCD_BKL");
 
 	/* gpio + 7 == DVI Enable */
-	gpio_request(gpio + 7, "EN_DVI");
-	gpio_direction_output(gpio + 7, 0);
+	gpio_request_one(gpio + 7, GPIOF_OUT_INIT_LOW, "EN_DVI");
 
 	/* TWL4030_GPIO_MAX + 1 == ledB (out, mmc0) */
 	gpio_leds[2].gpio = gpio + TWL4030_GPIO_MAX + 1;
@@ -489,15 +454,8 @@ static struct twl4030_platform_data omap3stalker_twldata = {
 	.codec		= &omap3stalker_codec_data,
 	.vdac		= &omap3_stalker_vdac,
 	.vpll2		= &omap3_stalker_vpll2,
-};
-
-static struct i2c_board_info __initdata omap3stalker_i2c_boardinfo[] = {
-	{
-	 I2C_BOARD_INFO("twl4030", 0x48),
-	 .flags		= I2C_CLIENT_WAKE,
-	 .irq		= INT_34XX_SYS_NIRQ,
-	 .platform_data	= &omap3stalker_twldata,
-	 },
+	.vmmc1		= &omap3stalker_vmmc1,
+	.vsim		= &omap3stalker_vsim,
 };
 
 static struct at24_platform_data fram_info = {
@@ -516,15 +474,7 @@ static struct i2c_board_info __initdata omap3stalker_i2c_boardinfo3[] = {
 
 static int __init omap3_stalker_i2c_init(void)
 {
-	/*
-	 * REVISIT: These entries can be set in omap3evm_twl_data
-	 * after a merge with MFD tree
-	 */
-	omap3stalker_twldata.vmmc1 = &omap3stalker_vmmc1;
-	omap3stalker_twldata.vsim = &omap3stalker_vsim;
-
-	omap_register_i2c_bus(1, 2600, omap3stalker_i2c_boardinfo,
-			      ARRAY_SIZE(omap3stalker_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &omap3stalker_twldata);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	omap_register_i2c_bus(3, 400, omap3stalker_i2c_boardinfo3,
 			      ARRAY_SIZE(omap3stalker_i2c_boardinfo3));
@@ -532,49 +482,6 @@ static int __init omap3_stalker_i2c_init(void)
 }
 
 #define OMAP3_STALKER_TS_GPIO	175
-static void ads7846_dev_init(void)
-{
-	if (gpio_request(OMAP3_STALKER_TS_GPIO, "ADS7846 pendown") < 0)
-		printk(KERN_ERR "can't get ads7846 pen down GPIO\n");
-
-	gpio_direction_input(OMAP3_STALKER_TS_GPIO);
-	gpio_set_debounce(OMAP3_STALKER_TS_GPIO, 310);
-}
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(OMAP3_STALKER_TS_GPIO);
-}
-
-static struct ads7846_platform_data ads7846_config = {
-	.x_max			= 0x0fff,
-	.y_max			= 0x0fff,
-	.x_plate_ohms		= 180,
-	.pressure_max		= 255,
-	.debounce_max		= 10,
-	.debounce_tol		= 3,
-	.debounce_rep		= 1,
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-	.settle_delay_usecs	= 150,
-};
-
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode		= 0,
-	.single_channel		= 1,	/* 0: slave, 1: master */
-};
-
-static struct spi_board_info omap3stalker_spi_board_info[] = {
-	[0] = {
-	       .modalias	= "ads7846",
-	       .bus_num		= 1,
-	       .chip_select	= 0,
-	       .max_speed_hz	= 1500000,
-	       .controller_data	= &ads7846_mcspi_config,
-	       .irq		= OMAP_GPIO_IRQ(OMAP3_STALKER_TS_GPIO),
-	       .platform_data	= &ads7846_config,
-	},
-};
 
 static struct omap_board_config_kernel omap3_stalker_config[] __initdata = {
 };
@@ -618,12 +525,6 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type	= MUSB_INTERFACE_ULPI,
-	.mode		= MUSB_OTG,
-	.power		= 100,
-};
-
 static void __init omap3_stalker_init(void)
 {
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CUS);
@@ -636,13 +537,11 @@ static void __init omap3_stalker_init(void)
 			     ARRAY_SIZE(omap3_stalker_devices));
 
 	omap_display_init(&omap3_stalker_dss_data);
-	spi_register_board_info(omap3stalker_spi_board_info,
-				ARRAY_SIZE(omap3stalker_spi_board_info));
 
 	omap_serial_init();
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	usbhs_init(&usbhs_bdata);
-	ads7846_dev_init();
+	omap_ads7846_init(1, OMAP3_STALKER_TS_GPIO, 310, NULL);
 
 	omap_mux_init_gpio(21, OMAP_PIN_OUTPUT);
 	omap_mux_init_gpio(18, OMAP_PIN_INPUT_PULLUP);
diff --git a/arch/arm/mach-omap2/board-omap3touchbook.c b/arch/arm/mach-omap2/board-omap3touchbook.c
index 127cb17..82872d7 100644
--- a/arch/arm/mach-omap2/board-omap3touchbook.c
+++ b/arch/arm/mach-omap2/board-omap3touchbook.c
@@ -52,6 +52,7 @@
 #include "mux.h"
 #include "hsmmc.h"
 #include "timer-gp.h"
+#include "common-board-devices.h"
 
 #include <asm/setup.h>
 
@@ -95,15 +96,6 @@ static struct mtd_partition omap3touchbook_nand_partitions[] = {
 	},
 };
 
-static struct omap_nand_platform_data omap3touchbook_nand_data = {
-	.options	= NAND_BUSWIDTH_16,
-	.parts		= omap3touchbook_nand_partitions,
-	.nr_parts	= ARRAY_SIZE(omap3touchbook_nand_partitions),
-	.dma_channel	= -1,		/* disable DMA in OMAP NAND driver */
-	.nand_setup	= NULL,
-	.dev_ready	= NULL,
-};
-
 #include "sdram-micron-mt46h32m32lf-6.h"
 
 static struct omap2_hsmmc_info mmc[] = {
@@ -154,13 +146,11 @@ static int touchbook_twl_gpio_setup(struct device *dev,
 	/* REVISIT: need ehci-omap hooks for external VBUS
 	 * power switch and overcurrent detect
 	 */
-
-	gpio_request(gpio + 1, "EHCI_nOC");
-	gpio_direction_input(gpio + 1);
+	gpio_request_one(gpio + 1, GPIOF_IN, "EHCI_nOC");
 
 	/* TWL4030_GPIO_MAX + 0 == ledA, EHCI nEN_USB_PWR (out, active low) */
-	gpio_request(gpio + TWL4030_GPIO_MAX, "nEN_USB_PWR");
-	gpio_direction_output(gpio + TWL4030_GPIO_MAX, 0);
+	gpio_request_one(gpio + TWL4030_GPIO_MAX, GPIOF_OUT_INIT_LOW,
+			 "nEN_USB_PWR");
 
 	/* TWL4030_GPIO_MAX + 1 == ledB, PMU_STAT (out, active low LED) */
 	gpio_leds[2].gpio = gpio + TWL4030_GPIO_MAX + 1;
@@ -273,15 +263,6 @@ static struct twl4030_platform_data touchbook_twldata = {
 	.vpll2		= &touchbook_vpll2,
 };
 
-static struct i2c_board_info __initdata touchbook_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl4030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &touchbook_twldata,
-	},
-};
-
 static struct i2c_board_info __initdata touchBook_i2c_boardinfo[] = {
 	{
 		I2C_BOARD_INFO("bq27200", 0x55),
@@ -291,8 +272,7 @@ static struct i2c_board_info __initdata touchBook_i2c_boardinfo[] = {
 static int __init omap3_touchbook_i2c_init(void)
 {
 	/* Standard TouchBook bus */
-	omap_register_i2c_bus(1, 2600, touchbook_i2c_boardinfo,
-			ARRAY_SIZE(touchbook_i2c_boardinfo));
+	omap3_pmic_init("twl4030", &touchbook_twldata);
 
 	/* Additional TouchBook bus */
 	omap_register_i2c_bus(3, 100, touchBook_i2c_boardinfo,
@@ -301,19 +281,7 @@ static int __init omap3_touchbook_i2c_init(void)
 	return 0;
 }
 
-static void __init omap3_ads7846_init(void)
-{
-	if (gpio_request(OMAP3_TS_GPIO, "ads7846_pen_down")) {
-		printk(KERN_ERR "Failed to request GPIO %d for "
-				"ads7846 pen down IRQ\n", OMAP3_TS_GPIO);
-		return;
-	}
-
-	gpio_direction_input(OMAP3_TS_GPIO);
-	gpio_set_debounce(OMAP3_TS_GPIO, 310);
-}
-
-static struct ads7846_platform_data ads7846_config = {
+static struct ads7846_platform_data ads7846_pdata = {
 	.x_min			= 100,
 	.y_min			= 265,
 	.x_max			= 3950,
@@ -327,23 +295,6 @@ static struct ads7846_platform_data ads7846_config = {
 	.keep_vref_on		= 1,
 };
 
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static struct spi_board_info omap3_ads7846_spi_board_info[] __initdata = {
-	{
-		.modalias		= "ads7846",
-		.bus_num		= 4,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &ads7846_mcspi_config,
-		.irq			= OMAP_GPIO_IRQ(OMAP3_TS_GPIO),
-		.platform_data		= &ads7846_config,
-	}
-};
-
 static struct gpio_led gpio_leds[] = {
 	{
 		.name			= "touchbook::usr0",
@@ -434,39 +385,6 @@ static struct platform_device *omap3_touchbook_devices[] __initdata = {
 	&keys_gpio,
 };
 
-static void __init omap3touchbook_flash_init(void)
-{
-	u8 cs = 0;
-	u8 nandcs = GPMC_CS_NUM + 1;
-
-	/* find out the chip-select on which NAND exists */
-	while (cs < GPMC_CS_NUM) {
-		u32 ret = 0;
-		ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG1);
-
-		if ((ret & 0xC00) == 0x800) {
-			printk(KERN_INFO "Found NAND on CS%d\n", cs);
-			if (nandcs > GPMC_CS_NUM)
-				nandcs = cs;
-		}
-		cs++;
-	}
-
-	if (nandcs > GPMC_CS_NUM) {
-		printk(KERN_INFO "NAND: Unable to find configuration "
-				 "in GPMC\n ");
-		return;
-	}
-
-	if (nandcs < GPMC_CS_NUM) {
-		omap3touchbook_nand_data.cs = nandcs;
-
-		printk(KERN_INFO "Registering NAND on CS%d\n", nandcs);
-		if (gpmc_nand_init(&omap3touchbook_nand_data) < 0)
-			printk(KERN_ERR "Unable to register NAND device\n");
-	}
-}
-
 static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
 
 	.port_mode[0] = OMAP_EHCI_PORT_MODE_PHY,
@@ -481,15 +399,10 @@ static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
 
 static void omap3_touchbook_poweroff(void)
 {
-	int r;
+	int pwr_off = TB_KILL_POWER_GPIO;
 
-	r = gpio_request(TB_KILL_POWER_GPIO, "DVI reset");
-	if (r < 0) {
+	if (gpio_request_one(pwr_off, GPIOF_OUT_INIT_LOW, "DVI reset") < 0)
 		printk(KERN_ERR "Unable to get kill power GPIO\n");
-		return;
-	}
-
-	gpio_direction_output(TB_KILL_POWER_GPIO, 0);
 }
 
 static int __init early_touchbook_revision(char *p)
@@ -501,12 +414,6 @@ static int __init early_touchbook_revision(char *p)
 }
 early_param("tbr", early_touchbook_revision);
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static void __init omap3_touchbook_init(void)
 {
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
@@ -521,17 +428,15 @@ static void __init omap3_touchbook_init(void)
 	omap_serial_init();
 
 	omap_mux_init_gpio(170, OMAP_PIN_INPUT);
-	gpio_request(176, "DVI_nPD");
 	/* REVISIT leave DVI powered down until it's needed ... */
-	gpio_direction_output(176, true);
+	gpio_request_one(176, GPIOF_OUT_INIT_HIGH, "DVI_nPD");
 
 	/* Touchscreen and accelerometer */
-	spi_register_board_info(omap3_ads7846_spi_board_info,
-				ARRAY_SIZE(omap3_ads7846_spi_board_info));
-	omap3_ads7846_init();
-	usb_musb_init(&musb_board_data);
+	omap_ads7846_init(4, OMAP3_TS_GPIO, 310, &ads7846_pdata);
+	usb_musb_init(NULL);
 	usbhs_init(&usbhs_bdata);
-	omap3touchbook_flash_init();
+	omap_nand_flash_init(NAND_BUSWIDTH_16, omap3touchbook_nand_partitions,
+			     ARRAY_SIZE(omap3touchbook_nand_partitions));
 
 	/* Ensure SDRC pins are mux'd for self-refresh */
 	omap_mux_init_signal("sdrc_cke0", OMAP_PIN_OUTPUT);
diff --git a/arch/arm/mach-omap2/board-omap4panda.c b/arch/arm/mach-omap2/board-omap4panda.c
index e4973ac..90485fc 100644
--- a/arch/arm/mach-omap2/board-omap4panda.c
+++ b/arch/arm/mach-omap2/board-omap4panda.c
@@ -46,6 +46,7 @@
 #include "hsmmc.h"
 #include "control.h"
 #include "mux.h"
+#include "common-board-devices.h"
 
 #define GPIO_HUB_POWER		1
 #define GPIO_HUB_NRESET		62
@@ -111,6 +112,11 @@ static const struct usbhs_omap_board_data usbhs_bdata __initconst = {
 	.reset_gpio_port[2]  = -EINVAL
 };
 
+static struct gpio panda_ehci_gpios[] __initdata = {
+	{ GPIO_HUB_POWER,	GPIOF_OUT_INIT_LOW,  "hub_power"  },
+	{ GPIO_HUB_NRESET,	GPIOF_OUT_INIT_LOW,  "hub_nreset" },
+};
+
 static void __init omap4_ehci_init(void)
 {
 	int ret;
@@ -120,44 +126,27 @@ static void __init omap4_ehci_init(void)
 	phy_ref_clk = clk_get(NULL, "auxclk3_ck");
 	if (IS_ERR(phy_ref_clk)) {
 		pr_err("Cannot request auxclk3\n");
-		goto error1;
+		return;
 	}
 	clk_set_rate(phy_ref_clk, 19200000);
 	clk_enable(phy_ref_clk);
 
-	/* disable the power to the usb hub prior to init */
-	ret = gpio_request(GPIO_HUB_POWER, "hub_power");
+	/* disable the power to the usb hub prior to init and reset phy+hub */
+	ret = gpio_request_array(panda_ehci_gpios,
+				 ARRAY_SIZE(panda_ehci_gpios));
 	if (ret) {
-		pr_err("Cannot request GPIO %d\n", GPIO_HUB_POWER);
-		goto error1;
+		pr_err("Unable to initialize EHCI power/reset\n");
+		return;
 	}
-	gpio_export(GPIO_HUB_POWER, 0);
-	gpio_direction_output(GPIO_HUB_POWER, 0);
-	gpio_set_value(GPIO_HUB_POWER, 0);
 
-	/* reset phy+hub */
-	ret = gpio_request(GPIO_HUB_NRESET, "hub_nreset");
-	if (ret) {
-		pr_err("Cannot request GPIO %d\n", GPIO_HUB_NRESET);
-		goto error2;
-	}
+	gpio_export(GPIO_HUB_POWER, 0);
 	gpio_export(GPIO_HUB_NRESET, 0);
-	gpio_direction_output(GPIO_HUB_NRESET, 0);
-	gpio_set_value(GPIO_HUB_NRESET, 0);
 	gpio_set_value(GPIO_HUB_NRESET, 1);
 
 	usbhs_init(&usbhs_bdata);
 
 	/* enable power to hub */
 	gpio_set_value(GPIO_HUB_POWER, 1);
-	return;
-
-error2:
-	gpio_free(GPIO_HUB_POWER);
-error1:
-	pr_err("Unable to initialize EHCI power/reset\n");
-	return;
-
 }
 
 static struct omap_musb_board_data musb_board_data = {
@@ -408,15 +397,6 @@ static struct twl4030_platform_data omap4_panda_twldata = {
 	.usb		= &omap4_usbphy_data,
 };
 
-static struct i2c_board_info __initdata omap4_panda_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl6030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = OMAP44XX_IRQ_SYS_1N,
-		.platform_data = &omap4_panda_twldata,
-	},
-};
-
 /*
  * Display monitor features are burnt in their EEPROM as EDID data. The EEPROM
  * is connected as I2C slave device, and can be accessed at address 0x50
@@ -429,12 +409,7 @@ static struct i2c_board_info __initdata panda_i2c_eeprom[] = {
 
 static int __init omap4_panda_i2c_init(void)
 {
-	/*
-	 * Phoenix Audio IC needs I2C1 to
-	 * start with 400 KHz or less
-	 */
-	omap_register_i2c_bus(1, 400, omap4_panda_i2c_boardinfo,
-			ARRAY_SIZE(omap4_panda_i2c_boardinfo));
+	omap4_pmic_init("twl6030", &omap4_panda_twldata);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	/*
 	 * Bus 3 is attached to the DVI port where devices like the pico DLP
@@ -651,27 +626,19 @@ static void omap4_panda_hdmi_mux_init(void)
 			OMAP_PIN_INPUT_PULLUP);
 }
 
+static struct gpio panda_hdmi_gpios[] = {
+	{ HDMI_GPIO_HPD,	GPIOF_OUT_INIT_HIGH, "hdmi_gpio_hpd"   },
+	{ HDMI_GPIO_LS_OE,	GPIOF_OUT_INIT_HIGH, "hdmi_gpio_ls_oe" },
+};
+
 static int omap4_panda_panel_enable_hdmi(struct omap_dss_device *dssdev)
 {
 	int status;
 
-	status = gpio_request_one(HDMI_GPIO_HPD, GPIOF_OUT_INIT_HIGH,
-							"hdmi_gpio_hpd");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", HDMI_GPIO_HPD);
-		return status;
-	}
-	status = gpio_request_one(HDMI_GPIO_LS_OE, GPIOF_OUT_INIT_HIGH,
-							"hdmi_gpio_ls_oe");
-	if (status) {
-		pr_err("Cannot request GPIO %d\n", HDMI_GPIO_LS_OE);
-		goto error1;
-	}
-
-	return 0;
-
-error1:
-	gpio_free(HDMI_GPIO_HPD);
+	status = gpio_request_array(panda_hdmi_gpios,
+				    ARRAY_SIZE(panda_hdmi_gpios));
+	if (status)
+		pr_err("Cannot request HDMI GPIOs\n");
 
 	return status;
 }
diff --git a/arch/arm/mach-omap2/board-overo.c b/arch/arm/mach-omap2/board-overo.c
index 9d192ff..1555918 100644
--- a/arch/arm/mach-omap2/board-overo.c
+++ b/arch/arm/mach-omap2/board-overo.c
@@ -56,6 +56,7 @@
 #include "mux.h"
 #include "sdram-micron-mt46h32m32lf-6.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define OVERO_GPIO_BT_XGATE	15
 #define OVERO_GPIO_W2W_NRESET	16
@@ -74,30 +75,6 @@
 #if defined(CONFIG_TOUCHSCREEN_ADS7846) || \
 	defined(CONFIG_TOUCHSCREEN_ADS7846_MODULE)
 
-#include <linux/spi/ads7846.h>
-
-static struct omap2_mcspi_device_config ads7846_mcspi_config = {
-	.turbo_mode	= 0,
-	.single_channel	= 1,	/* 0: slave, 1: master */
-};
-
-static int ads7846_get_pendown_state(void)
-{
-	return !gpio_get_value(OVERO_GPIO_PENDOWN);
-}
-
-static struct ads7846_platform_data ads7846_config = {
-	.x_max			= 0x0fff,
-	.y_max			= 0x0fff,
-	.x_plate_ohms		= 180,
-	.pressure_max		= 255,
-	.debounce_max		= 10,
-	.debounce_tol		= 3,
-	.debounce_rep		= 1,
-	.get_pendown_state	= ads7846_get_pendown_state,
-	.keep_vref_on		= 1,
-};
-
 /* fixed regulator for ads7846 */
 static struct regulator_consumer_supply ads7846_supply =
 	REGULATOR_SUPPLY("vcc", "spi1.0");
@@ -128,14 +105,7 @@ static struct platform_device vads7846_device = {
 
 static void __init overo_ads7846_init(void)
 {
-	if ((gpio_request(OVERO_GPIO_PENDOWN, "ADS7846_PENDOWN") == 0) &&
-	    (gpio_direction_input(OVERO_GPIO_PENDOWN) == 0)) {
-		gpio_export(OVERO_GPIO_PENDOWN, 0);
-	} else {
-		printk(KERN_ERR "could not obtain gpio for ADS7846_PENDOWN\n");
-		return;
-	}
-
+	omap_ads7846_init(1, OVERO_GPIO_PENDOWN, 0, NULL);
 	platform_device_register(&vads7846_device);
 }
 
@@ -146,106 +116,28 @@ static inline void __init overo_ads7846_init(void) { return; }
 #if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
 
 #include <linux/smsc911x.h>
+#include <plat/gpmc-smsc911x.h>
 
-static struct resource overo_smsc911x_resources[] = {
-	{
-		.name	= "smsc911x-memory",
-		.flags	= IORESOURCE_MEM,
-	},
-	{
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
-};
-
-static struct resource overo_smsc911x2_resources[] = {
-	{
-		.name	= "smsc911x2-memory",
-		.flags	= IORESOURCE_MEM,
-	},
-	{
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
-};
-
-static struct smsc911x_platform_config overo_smsc911x_config = {
-	.irq_polarity	= SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type	= SMSC911X_IRQ_TYPE_OPEN_DRAIN,
-	.flags		= SMSC911X_USE_32BIT ,
-	.phy_interface	= PHY_INTERFACE_MODE_MII,
-};
-
-static struct platform_device overo_smsc911x_device = {
-	.name		= "smsc911x",
+static struct omap_smsc911x_platform_data smsc911x_cfg = {
 	.id		= 0,
-	.num_resources	= ARRAY_SIZE(overo_smsc911x_resources),
-	.resource	= overo_smsc911x_resources,
-	.dev		= {
-		.platform_data = &overo_smsc911x_config,
-	},
+	.cs             = OVERO_SMSC911X_CS,
+	.gpio_irq       = OVERO_SMSC911X_GPIO,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT,
 };
 
-static struct platform_device overo_smsc911x2_device = {
-	.name		= "smsc911x",
+static struct omap_smsc911x_platform_data smsc911x2_cfg = {
 	.id		= 1,
-	.num_resources	= ARRAY_SIZE(overo_smsc911x2_resources),
-	.resource	= overo_smsc911x2_resources,
-	.dev		= {
-		.platform_data = &overo_smsc911x_config,
-	},
+	.cs             = OVERO_SMSC911X2_CS,
+	.gpio_irq       = OVERO_SMSC911X2_GPIO,
+	.gpio_reset     = -EINVAL,
+	.flags		= SMSC911X_USE_32BIT,
 };
 
-static struct platform_device *smsc911x_devices[] = {
-	&overo_smsc911x_device,
-	&overo_smsc911x2_device,
-};
-
-static inline void __init overo_init_smsc911x(void)
+static void __init overo_init_smsc911x(void)
 {
-	unsigned long cs_mem_base, cs_mem_base2;
-
-	/* set up first smsc911x chip */
-
-	if (gpmc_cs_request(OVERO_SMSC911X_CS, SZ_16M, &cs_mem_base) < 0) {
-		printk(KERN_ERR "Failed request for GPMC mem for smsc911x\n");
-		return;
-	}
-
-	overo_smsc911x_resources[0].start = cs_mem_base + 0x0;
-	overo_smsc911x_resources[0].end   = cs_mem_base + 0xff;
-
-	if ((gpio_request(OVERO_SMSC911X_GPIO, "SMSC911X IRQ") == 0) &&
-	    (gpio_direction_input(OVERO_SMSC911X_GPIO) == 0)) {
-		gpio_export(OVERO_SMSC911X_GPIO, 0);
-	} else {
-		printk(KERN_ERR "could not obtain gpio for SMSC911X IRQ\n");
-		return;
-	}
-
-	overo_smsc911x_resources[1].start = OMAP_GPIO_IRQ(OVERO_SMSC911X_GPIO);
-	overo_smsc911x_resources[1].end	  = 0;
-
-	/* set up second smsc911x chip */
-
-	if (gpmc_cs_request(OVERO_SMSC911X2_CS, SZ_16M, &cs_mem_base2) < 0) {
-		printk(KERN_ERR "Failed request for GPMC mem for smsc911x2\n");
-		return;
-	}
-
-	overo_smsc911x2_resources[0].start = cs_mem_base2 + 0x0;
-	overo_smsc911x2_resources[0].end   = cs_mem_base2 + 0xff;
-
-	if ((gpio_request(OVERO_SMSC911X2_GPIO, "SMSC911X2 IRQ") == 0) &&
-	    (gpio_direction_input(OVERO_SMSC911X2_GPIO) == 0)) {
-		gpio_export(OVERO_SMSC911X2_GPIO, 0);
-	} else {
-		printk(KERN_ERR "could not obtain gpio for SMSC911X2 IRQ\n");
-		return;
-	}
-
-	overo_smsc911x2_resources[1].start = OMAP_GPIO_IRQ(OVERO_SMSC911X2_GPIO);
-	overo_smsc911x2_resources[1].end   = 0;
-
-	platform_add_devices(smsc911x_devices, ARRAY_SIZE(smsc911x_devices));
+	gpmc_smsc911x_init(&smsc911x_cfg);
+	gpmc_smsc911x_init(&smsc911x2_cfg);
 }
 
 #else
@@ -259,21 +151,20 @@ static int dvi_enabled;
 #define OVERO_GPIO_LCD_EN 144
 #define OVERO_GPIO_LCD_BL 145
 
+static struct gpio overo_dss_gpios[] __initdata = {
+	{ OVERO_GPIO_LCD_EN, GPIOF_OUT_INIT_HIGH, "OVERO_GPIO_LCD_EN" },
+	{ OVERO_GPIO_LCD_BL, GPIOF_OUT_INIT_HIGH, "OVERO_GPIO_LCD_BL" },
+};
+
 static void __init overo_display_init(void)
 {
-	if ((gpio_request(OVERO_GPIO_LCD_EN, "OVERO_GPIO_LCD_EN") == 0) &&
-	    (gpio_direction_output(OVERO_GPIO_LCD_EN, 1) == 0))
-		gpio_export(OVERO_GPIO_LCD_EN, 0);
-	else
-		printk(KERN_ERR "could not obtain gpio for "
-					"OVERO_GPIO_LCD_EN\n");
+	if (gpio_request_array(overo_dss_gpios, ARRAY_SIZE(overo_dss_gpios))) {
+		printk(KERN_ERR "could not obtain DSS control GPIOs\n");
+		return;
+	}
 
-	if ((gpio_request(OVERO_GPIO_LCD_BL, "OVERO_GPIO_LCD_BL") == 0) &&
-	    (gpio_direction_output(OVERO_GPIO_LCD_BL, 1) == 0))
-		gpio_export(OVERO_GPIO_LCD_BL, 0);
-	else
-		printk(KERN_ERR "could not obtain gpio for "
-					"OVERO_GPIO_LCD_BL\n");
+	gpio_export(OVERO_GPIO_LCD_EN, 0);
+	gpio_export(OVERO_GPIO_LCD_BL, 0);
 }
 
 static int overo_panel_enable_dvi(struct omap_dss_device *dssdev)
@@ -412,45 +303,6 @@ static struct mtd_partition overo_nand_partitions[] = {
 	},
 };
 
-static struct omap_nand_platform_data overo_nand_data = {
-	.parts = overo_nand_partitions,
-	.nr_parts = ARRAY_SIZE(overo_nand_partitions),
-	.dma_channel = -1,	/* disable DMA in OMAP NAND driver */
-};
-
-static void __init overo_flash_init(void)
-{
-	u8 cs = 0;
-	u8 nandcs = GPMC_CS_NUM + 1;
-
-	/* find out the chip-select on which NAND exists */
-	while (cs < GPMC_CS_NUM) {
-		u32 ret = 0;
-		ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG1);
-
-		if ((ret & 0xC00) == 0x800) {
-			printk(KERN_INFO "Found NAND on CS%d\n", cs);
-			if (nandcs > GPMC_CS_NUM)
-				nandcs = cs;
-		}
-		cs++;
-	}
-
-	if (nandcs > GPMC_CS_NUM) {
-		printk(KERN_INFO "NAND: Unable to find configuration "
-				 "in GPMC\n ");
-		return;
-	}
-
-	if (nandcs < GPMC_CS_NUM) {
-		overo_nand_data.cs = nandcs;
-
-		printk(KERN_INFO "Registering NAND on CS%d\n", nandcs);
-		if (gpmc_nand_init(&overo_nand_data) < 0)
-			printk(KERN_ERR "Unable to register NAND device\n");
-	}
-}
-
 static struct omap2_hsmmc_info mmc[] = {
 	{
 		.mmc		= 1,
@@ -648,37 +500,15 @@ static struct twl4030_platform_data overo_twldata = {
 	.vpll2		= &overo_vpll2,
 };
 
-static struct i2c_board_info __initdata overo_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("tps65950", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &overo_twldata,
-	},
-};
-
 static int __init overo_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2600, overo_i2c_boardinfo,
-			ARRAY_SIZE(overo_i2c_boardinfo));
+	omap3_pmic_init("tps65950", &overo_twldata);
 	/* i2c2 pins are used for gpio */
 	omap_register_i2c_bus(3, 400, NULL, 0);
 	return 0;
 }
 
 static struct spi_board_info overo_spi_board_info[] __initdata = {
-#if defined(CONFIG_TOUCHSCREEN_ADS7846) || \
-	defined(CONFIG_TOUCHSCREEN_ADS7846_MODULE)
-	{
-		.modalias		= "ads7846",
-		.bus_num		= 1,
-		.chip_select		= 0,
-		.max_speed_hz		= 1500000,
-		.controller_data	= &ads7846_mcspi_config,
-		.irq			= OMAP_GPIO_IRQ(OVERO_GPIO_PENDOWN),
-		.platform_data		= &ads7846_config,
-	},
-#endif
 #if defined(CONFIG_PANEL_LGPHILIPS_LB035Q02) || \
 	defined(CONFIG_PANEL_LGPHILIPS_LB035Q02_MODULE)
 	{
@@ -722,20 +552,22 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
+static struct gpio overo_bt_gpios[] __initdata = {
+	{ OVERO_GPIO_BT_XGATE,	GPIOF_OUT_INIT_LOW,	"lcd enable"    },
+	{ OVERO_GPIO_BT_NRESET, GPIOF_OUT_INIT_HIGH,	"lcd bl enable" },
 };
 
 static void __init overo_init(void)
 {
+	int ret;
+
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
 	overo_i2c_init();
 	omap_display_init(&overo_dss_data);
 	omap_serial_init();
-	overo_flash_init();
-	usb_musb_init(&musb_board_data);
+	omap_nand_flash_init(0, overo_nand_partitions,
+			     ARRAY_SIZE(overo_nand_partitions));
+	usb_musb_init(NULL);
 	usbhs_init(&usbhs_bdata);
 	overo_spi_init();
 	overo_ads7846_init();
@@ -748,9 +580,9 @@ static void __init overo_init(void)
 	omap_mux_init_signal("sdrc_cke0", OMAP_PIN_OUTPUT);
 	omap_mux_init_signal("sdrc_cke1", OMAP_PIN_OUTPUT);
 
-	if ((gpio_request(OVERO_GPIO_W2W_NRESET,
-			  "OVERO_GPIO_W2W_NRESET") == 0) &&
-	    (gpio_direction_output(OVERO_GPIO_W2W_NRESET, 1) == 0)) {
+	ret = gpio_request_one(OVERO_GPIO_W2W_NRESET, GPIOF_OUT_INIT_HIGH,
+			       "OVERO_GPIO_W2W_NRESET");
+	if (ret == 0) {
 		gpio_export(OVERO_GPIO_W2W_NRESET, 0);
 		gpio_set_value(OVERO_GPIO_W2W_NRESET, 0);
 		udelay(10);
@@ -760,25 +592,20 @@ static void __init overo_init(void)
 					"OVERO_GPIO_W2W_NRESET\n");
 	}
 
-	if ((gpio_request(OVERO_GPIO_BT_XGATE, "OVERO_GPIO_BT_XGATE") == 0) &&
-	    (gpio_direction_output(OVERO_GPIO_BT_XGATE, 0) == 0))
+	ret = gpio_request_array(overo_bt_gpios, ARRAY_SIZE(overo_bt_gpios));
+	if (ret) {
+		pr_err("%s: could not obtain BT gpios\n", __func__);
+	} else {
 		gpio_export(OVERO_GPIO_BT_XGATE, 0);
-	else
-		printk(KERN_ERR "could not obtain gpio for OVERO_GPIO_BT_XGATE\n");
-
-	if ((gpio_request(OVERO_GPIO_BT_NRESET, "OVERO_GPIO_BT_NRESET") == 0) &&
-	    (gpio_direction_output(OVERO_GPIO_BT_NRESET, 1) == 0)) {
 		gpio_export(OVERO_GPIO_BT_NRESET, 0);
 		gpio_set_value(OVERO_GPIO_BT_NRESET, 0);
 		mdelay(6);
 		gpio_set_value(OVERO_GPIO_BT_NRESET, 1);
-	} else {
-		printk(KERN_ERR "could not obtain gpio for "
-					"OVERO_GPIO_BT_NRESET\n");
 	}
 
-	if ((gpio_request(OVERO_GPIO_USBH_CPEN, "OVERO_GPIO_USBH_CPEN") == 0) &&
-	    (gpio_direction_output(OVERO_GPIO_USBH_CPEN, 1) == 0))
+	ret = gpio_request_one(OVERO_GPIO_USBH_CPEN, GPIOF_OUT_INIT_HIGH,
+			       "OVERO_GPIO_USBH_CPEN");
+	if (ret == 0)
 		gpio_export(OVERO_GPIO_USBH_CPEN, 0);
 	else
 		printk(KERN_ERR "could not obtain gpio for "
diff --git a/arch/arm/mach-omap2/board-rm680.c b/arch/arm/mach-omap2/board-rm680.c
index 2af8b05..42d10b1 100644
--- a/arch/arm/mach-omap2/board-rm680.c
+++ b/arch/arm/mach-omap2/board-rm680.c
@@ -31,6 +31,7 @@
 #include "mux.h"
 #include "hsmmc.h"
 #include "sdram-nokia.h"
+#include "common-board-devices.h"
 
 static struct regulator_consumer_supply rm680_vemmc_consumers[] = {
 	REGULATOR_SUPPLY("vmmc", "omap_hsmmc.1"),
@@ -90,19 +91,9 @@ static struct twl4030_platform_data rm680_twl_data = {
 	/* add rest of the children here */
 };
 
-static struct i2c_board_info __initdata rm680_twl_i2c_board_info[] = {
-	{
-		I2C_BOARD_INFO("twl5031", 0x48),
-		.flags		= I2C_CLIENT_WAKE,
-		.irq		= INT_34XX_SYS_NIRQ,
-		.platform_data	= &rm680_twl_data,
-	},
-};
-
 static void __init rm680_i2c_init(void)
 {
-	omap_register_i2c_bus(1, 2900, rm680_twl_i2c_board_info,
-				ARRAY_SIZE(rm680_twl_i2c_board_info));
+	omap_pmic_init(1, 2900, "twl5031", INT_34XX_SYS_NIRQ, &rm680_twl_data);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	omap_register_i2c_bus(3, 400, NULL, 0);
 }
@@ -153,17 +144,11 @@ static struct omap_board_mux board_mux[] __initdata = {
 };
 #endif
 
-static struct omap_musb_board_data rm680_musb_data = {
-	.interface_type	= MUSB_INTERFACE_ULPI,
-	.mode		= MUSB_PERIPHERAL,
-	.power		= 100,
-};
-
 static void __init rm680_init(void)
 {
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
 	omap_serial_init();
-	usb_musb_init(&rm680_musb_data);
+	usb_musb_init(NULL);
 	rm680_peripherals_init();
 }
 
diff --git a/arch/arm/mach-omap2/board-rx51-peripherals.c b/arch/arm/mach-omap2/board-rx51-peripherals.c
index 01ee0a1..2b00f72 100644
--- a/arch/arm/mach-omap2/board-rx51-peripherals.c
+++ b/arch/arm/mach-omap2/board-rx51-peripherals.c
@@ -43,6 +43,7 @@
 
 #include "mux.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define SYSTEM_REV_B_USES_VAUX3	0x1699
 #define SYSTEM_REV_S_USES_VAUX3 0x8
@@ -557,10 +558,8 @@ static __init void rx51_init_si4713(void)
 static int rx51_twlgpio_setup(struct device *dev, unsigned gpio, unsigned n)
 {
 	/* FIXME this gpio setup is just a placeholder for now */
-	gpio_request(gpio + 6, "backlight_pwm");
-	gpio_direction_output(gpio + 6, 0);
-	gpio_request(gpio + 7, "speaker_en");
-	gpio_direction_output(gpio + 7, 1);
+	gpio_request_one(gpio + 6, GPIOF_OUT_INIT_LOW, "backlight_pwm");
+	gpio_request_one(gpio + 7, GPIOF_OUT_INIT_HIGH, "speaker_en");
 
 	return 0;
 }
@@ -777,15 +776,6 @@ static struct tpa6130a2_platform_data rx51_tpa6130a2_data __initdata_or_module =
 	.power_gpio		= 98,
 };
 
-static struct i2c_board_info __initdata rx51_peripherals_i2c_board_info_1[] = {
-	{
-		I2C_BOARD_INFO("twl5030", 0x48),
-		.flags = I2C_CLIENT_WAKE,
-		.irq = INT_34XX_SYS_NIRQ,
-		.platform_data = &rx51_twldata,
-	},
-};
-
 /* Audio setup data */
 static struct aic3x_setup_data rx51_aic34_setup = {
 	.gpio_func[0] = AIC3X_GPIO1_FUNC_DISABLED,
@@ -833,8 +823,7 @@ static int __init rx51_i2c_init(void)
 		rx51_twldata.vaux3 = &rx51_vaux3_cam;
 	}
 	rx51_twldata.vmmc2 = &rx51_vmmc2;
-	omap_register_i2c_bus(1, 2200, rx51_peripherals_i2c_board_info_1,
-			      ARRAY_SIZE(rx51_peripherals_i2c_board_info_1));
+	omap_pmic_init(1, 2200, "twl5030", INT_34XX_SYS_NIRQ, &rx51_twldata);
 	omap_register_i2c_bus(2, 100, rx51_peripherals_i2c_board_info_2,
 			      ARRAY_SIZE(rx51_peripherals_i2c_board_info_2));
 	omap_register_i2c_bus(3, 400, NULL, 0);
@@ -921,26 +910,20 @@ static void rx51_wl1251_set_power(bool enable)
 	gpio_set_value(RX51_WL1251_POWER_GPIO, enable);
 }
 
+static struct gpio rx51_wl1251_gpios[] __initdata = {
+	{ RX51_WL1251_POWER_GPIO, GPIOF_OUT_INIT_LOW,	"wl1251 power"	},
+	{ RX51_WL1251_IRQ_GPIO,	  GPIOF_IN,		"wl1251 irq"	},
+};
+
 static void __init rx51_init_wl1251(void)
 {
 	int irq, ret;
 
-	ret = gpio_request(RX51_WL1251_POWER_GPIO, "wl1251 power");
+	ret = gpio_request_array(rx51_wl1251_gpios,
+				 ARRAY_SIZE(rx51_wl1251_gpios));
 	if (ret < 0)
 		goto error;
 
-	ret = gpio_direction_output(RX51_WL1251_POWER_GPIO, 0);
-	if (ret < 0)
-		goto err_power;
-
-	ret = gpio_request(RX51_WL1251_IRQ_GPIO, "wl1251 irq");
-	if (ret < 0)
-		goto err_power;
-
-	ret = gpio_direction_input(RX51_WL1251_IRQ_GPIO);
-	if (ret < 0)
-		goto err_irq;
-
 	irq = gpio_to_irq(RX51_WL1251_IRQ_GPIO);
 	if (irq < 0)
 		goto err_irq;
@@ -952,10 +935,7 @@ static void __init rx51_init_wl1251(void)
 
 err_irq:
 	gpio_free(RX51_WL1251_IRQ_GPIO);
-
-err_power:
 	gpio_free(RX51_WL1251_POWER_GPIO);
-
 error:
 	printk(KERN_ERR "wl1251 board initialisation failed\n");
 	wl1251_pdata.set_power = NULL;
diff --git a/arch/arm/mach-omap2/board-rx51-video.c b/arch/arm/mach-omap2/board-rx51-video.c
index 2df10b6..2c1289b 100644
--- a/arch/arm/mach-omap2/board-rx51-video.c
+++ b/arch/arm/mach-omap2/board-rx51-video.c
@@ -76,13 +76,12 @@ static int __init rx51_video_init(void)
 		return 0;
 	}
 
-	if (gpio_request(RX51_LCD_RESET_GPIO, "LCD ACX565AKM reset")) {
+	if (gpio_request_one(RX51_LCD_RESET_GPIO, GPIOF_OUT_INIT_HIGH,
+			     "LCD ACX565AKM reset")) {
 		pr_err("%s failed to get LCD Reset GPIO\n", __func__);
 		return 0;
 	}
 
-	gpio_direction_output(RX51_LCD_RESET_GPIO, 1);
-
 	omap_display_init(&rx51_dss_board_info);
 	return 0;
 }
diff --git a/arch/arm/mach-omap2/board-rx51.c b/arch/arm/mach-omap2/board-rx51.c
index f8ba20a..fec4cac 100644
--- a/arch/arm/mach-omap2/board-rx51.c
+++ b/arch/arm/mach-omap2/board-rx51.c
@@ -58,21 +58,25 @@ static struct platform_device leds_gpio = {
 	},
 };
 
+/*
+ * cpuidle C-states definition override from the default values.
+ * The 'exit_latency' field is the sum of sleep and wake-up latencies.
+ */
 static struct cpuidle_params rx51_cpuidle_params[] = {
 	/* C1 */
-	{1, 110, 162, 5},
+	{110 + 162, 5 , 1},
 	/* C2 */
-	{1, 106, 180, 309},
+	{106 + 180, 309, 1},
 	/* C3 */
-	{0, 107, 410, 46057},
+	{107 + 410, 46057, 0},
 	/* C4 */
-	{0, 121, 3374, 46057},
+	{121 + 3374, 46057, 0},
 	/* C5 */
-	{1, 855, 1146, 46057},
+	{855 + 1146, 46057, 1},
 	/* C6 */
-	{0, 7580, 4134, 484329},
+	{7580 + 4134, 484329, 0},
 	/* C7 */
-	{1, 7505, 15274, 484329},
+	{7505 + 15274, 484329, 1},
 };
 
 static struct omap_lcd_config rx51_lcd_config = {
diff --git a/arch/arm/mach-omap2/board-zoom-debugboard.c b/arch/arm/mach-omap2/board-zoom-debugboard.c
index 007ebdc..6402e78 100644
--- a/arch/arm/mach-omap2/board-zoom-debugboard.c
+++ b/arch/arm/mach-omap2/board-zoom-debugboard.c
@@ -15,6 +15,7 @@
 #include <linux/interrupt.h>
 
 #include <plat/gpmc.h>
+#include <plat/gpmc-smsc911x.h>
 
 #include <mach/board-zoom.h>
 
@@ -26,60 +27,16 @@
 #define DEBUG_BASE		0x08000000
 #define ZOOM_ETHR_START	DEBUG_BASE
 
-static struct resource zoom_smsc911x_resources[] = {
-	[0] = {
-		.start	= ZOOM_ETHR_START,
-		.end	= ZOOM_ETHR_START + SZ_4K,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.flags	= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
-	},
-};
-
-static struct smsc911x_platform_config zoom_smsc911x_config = {
-	.irq_polarity	= SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
-	.irq_type	= SMSC911X_IRQ_TYPE_OPEN_DRAIN,
+static struct omap_smsc911x_platform_data zoom_smsc911x_cfg = {
+	.cs             = ZOOM_SMSC911X_CS,
+	.gpio_irq       = ZOOM_SMSC911X_GPIO,
+	.gpio_reset     = -EINVAL,
 	.flags		= SMSC911X_USE_32BIT,
-	.phy_interface	= PHY_INTERFACE_MODE_MII,
-};
-
-static struct platform_device zoom_smsc911x_device = {
-	.name		= "smsc911x",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(zoom_smsc911x_resources),
-	.resource	= zoom_smsc911x_resources,
-	.dev		= {
-		.platform_data = &zoom_smsc911x_config,
-	},
 };
 
 static inline void __init zoom_init_smsc911x(void)
 {
-	int eth_cs;
-	unsigned long cs_mem_base;
-	int eth_gpio = 0;
-
-	eth_cs = ZOOM_SMSC911X_CS;
-
-	if (gpmc_cs_request(eth_cs, SZ_16M, &cs_mem_base) < 0) {
-		printk(KERN_ERR "Failed to request GPMC mem for smsc911x\n");
-		return;
-	}
-
-	zoom_smsc911x_resources[0].start = cs_mem_base + 0x0;
-	zoom_smsc911x_resources[0].end   = cs_mem_base + 0xff;
-
-	eth_gpio = ZOOM_SMSC911X_GPIO;
-
-	zoom_smsc911x_resources[1].start = OMAP_GPIO_IRQ(eth_gpio);
-
-	if (gpio_request(eth_gpio, "smsc911x irq") < 0) {
-		printk(KERN_ERR "Failed to request GPIO%d for smsc911x IRQ\n",
-				eth_gpio);
-		return;
-	}
-	gpio_direction_input(eth_gpio);
+	gpmc_smsc911x_init(&zoom_smsc911x_cfg);
 }
 
 static struct plat_serial8250_port serial_platform_data[] = {
@@ -120,12 +77,9 @@ static inline void __init zoom_init_quaduart(void)
 
 	quart_gpio = ZOOM_QUADUART_GPIO;
 
-	if (gpio_request(quart_gpio, "TL16CP754C GPIO") < 0) {
+	if (gpio_request_one(quart_gpio, GPIOF_IN, "TL16CP754C GPIO") < 0)
 		printk(KERN_ERR "Failed to request GPIO%d for TL16CP754C\n",
 								quart_gpio);
-		return;
-	}
-	gpio_direction_input(quart_gpio);
 }
 
 static inline int omap_zoom_debugboard_detect(void)
@@ -135,12 +89,12 @@ static inline int omap_zoom_debugboard_detect(void)
 
 	debug_board_detect = ZOOM_SMSC911X_GPIO;
 
-	if (gpio_request(debug_board_detect, "Zoom debug board detect") < 0) {
+	if (gpio_request_one(debug_board_detect, GPIOF_IN,
+			     "Zoom debug board detect") < 0) {
 		printk(KERN_ERR "Failed to request GPIO%d for Zoom debug"
 		"board detect\n", debug_board_detect);
 		return 0;
 	}
-	gpio_direction_input(debug_board_detect);
 
 	if (!gpio_get_value(debug_board_detect)) {
 		ret = 0;
@@ -150,7 +104,6 @@ static inline int omap_zoom_debugboard_detect(void)
 }
 
 static struct platform_device *zoom_devices[] __initdata = {
-	&zoom_smsc911x_device,
 	&zoom_debugboard_serial_device,
 };
 
diff --git a/arch/arm/mach-omap2/board-zoom-display.c b/arch/arm/mach-omap2/board-zoom-display.c
index 60e8645..c7c6beb 100644
--- a/arch/arm/mach-omap2/board-zoom-display.c
+++ b/arch/arm/mach-omap2/board-zoom-display.c
@@ -21,34 +21,19 @@
 #define LCD_PANEL_RESET_GPIO_PILOT	55
 #define LCD_PANEL_QVGA_GPIO		56
 
+static struct gpio zoom_lcd_gpios[] __initdata = {
+	{ -EINVAL,		GPIOF_OUT_INIT_HIGH, "lcd reset" },
+	{ LCD_PANEL_QVGA_GPIO,	GPIOF_OUT_INIT_HIGH, "lcd qvga"	 },
+};
+
 static void zoom_lcd_panel_init(void)
 {
-	int ret;
-	unsigned char lcd_panel_reset_gpio;
-
-	lcd_panel_reset_gpio = (omap_rev() > OMAP3430_REV_ES3_0) ?
+	zoom_lcd_gpios[0].gpio = (omap_rev() > OMAP3430_REV_ES3_0) ?
 			LCD_PANEL_RESET_GPIO_PROD :
 			LCD_PANEL_RESET_GPIO_PILOT;
 
-	ret = gpio_request(lcd_panel_reset_gpio, "lcd reset");
-	if (ret) {
-		pr_err("Failed to get LCD reset GPIO (gpio%d).\n",
-			lcd_panel_reset_gpio);
-		return;
-	}
-	gpio_direction_output(lcd_panel_reset_gpio, 1);
-
-	ret = gpio_request(LCD_PANEL_QVGA_GPIO, "lcd qvga");
-	if (ret) {
-		pr_err("Failed to get LCD_PANEL_QVGA_GPIO (gpio%d).\n",
-			LCD_PANEL_QVGA_GPIO);
-		goto err0;
-	}
-	gpio_direction_output(LCD_PANEL_QVGA_GPIO, 1);
-
-	return;
-err0:
-	gpio_free(lcd_panel_reset_gpio);
+	if (gpio_request_array(zoom_lcd_gpios, ARRAY_SIZE(zoom_lcd_gpios)))
+		pr_err("%s: Failed to get LCD GPIOs.\n", __func__);
 }
 
 static int zoom_panel_enable_lcd(struct omap_dss_device *dssdev)
diff --git a/arch/arm/mach-omap2/board-zoom-peripherals.c b/arch/arm/mach-omap2/board-zoom-peripherals.c
index 8dee754..118c6f5 100644
--- a/arch/arm/mach-omap2/board-zoom-peripherals.c
+++ b/arch/arm/mach-omap2/board-zoom-peripherals.c
@@ -31,6 +31,7 @@
 
 #include "mux.h"
 #include "hsmmc.h"
+#include "common-board-devices.h"
 
 #define OMAP_ZOOM_WLAN_PMENA_GPIO	(101)
 #define OMAP_ZOOM_WLAN_IRQ_GPIO		(162)
@@ -276,13 +277,11 @@ static int zoom_twl_gpio_setup(struct device *dev,
 	zoom_vsim_supply.dev = mmc[0].dev;
 	zoom_vmmc2_supply.dev = mmc[1].dev;
 
-	ret = gpio_request(LCD_PANEL_ENABLE_GPIO, "lcd enable");
-	if (ret) {
+	ret = gpio_request_one(LCD_PANEL_ENABLE_GPIO, GPIOF_OUT_INIT_LOW,
+			       "lcd enable");
+	if (ret)
 		pr_err("Failed to get LCD_PANEL_ENABLE_GPIO (gpio%d).\n",
 				LCD_PANEL_ENABLE_GPIO);
-		return ret;
-	}
-	gpio_direction_output(LCD_PANEL_ENABLE_GPIO, 0);
 
 	return ret;
 }
@@ -349,15 +348,6 @@ static struct twl4030_platform_data zoom_twldata = {
 	.vdac		= &zoom_vdac,
 };
 
-static struct i2c_board_info __initdata zoom_i2c_boardinfo[] = {
-	{
-		I2C_BOARD_INFO("twl5030", 0x48),
-		.flags		= I2C_CLIENT_WAKE,
-		.irq		= INT_34XX_SYS_NIRQ,
-		.platform_data	= &zoom_twldata,
-	},
-};
-
 static int __init omap_i2c_init(void)
 {
 	if (machine_is_omap_zoom2()) {
@@ -365,19 +355,12 @@ static int __init omap_i2c_init(void)
 		zoom_audio_data.hs_extmute = 1;
 		zoom_audio_data.set_hs_extmute = zoom2_set_hs_extmute;
 	}
-	omap_register_i2c_bus(1, 2400, zoom_i2c_boardinfo,
-			ARRAY_SIZE(zoom_i2c_boardinfo));
+	omap_pmic_init(1, 2400, "twl5030", INT_34XX_SYS_NIRQ, &zoom_twldata);
 	omap_register_i2c_bus(2, 400, NULL, 0);
 	omap_register_i2c_bus(3, 400, NULL, 0);
 	return 0;
 }
 
-static struct omap_musb_board_data musb_board_data = {
-	.interface_type		= MUSB_INTERFACE_ULPI,
-	.mode			= MUSB_OTG,
-	.power			= 100,
-};
-
 static void enable_board_wakeup_source(void)
 {
 	/* T2 interrupt line (keypad) */
@@ -392,7 +375,7 @@ void __init zoom_peripherals_init(void)
 
 	omap_i2c_init();
 	platform_device_register(&omap_vwlan_device);
-	usb_musb_init(&musb_board_data);
+	usb_musb_init(NULL);
 	enable_board_wakeup_source();
 	omap_serial_init();
 }
diff --git a/arch/arm/mach-omap2/common-board-devices.c b/arch/arm/mach-omap2/common-board-devices.c
new file mode 100644
index 0000000..e94903b
--- /dev/null
+++ b/arch/arm/mach-omap2/common-board-devices.c
@@ -0,0 +1,163 @@
+/*
+ * common-board-devices.c
+ *
+ * Copyright (C) 2011 CompuLab, Ltd.
+ * Author: Mike Rapoport <mike@compulab.co.il>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/i2c.h>
+#include <linux/i2c/twl.h>
+
+#include <linux/gpio.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/ads7846.h>
+
+#include <plat/i2c.h>
+#include <plat/mcspi.h>
+#include <plat/nand.h>
+
+#include "common-board-devices.h"
+
+static struct i2c_board_info __initdata pmic_i2c_board_info = {
+	.addr		= 0x48,
+	.flags		= I2C_CLIENT_WAKE,
+};
+
+void __init omap_pmic_init(int bus, u32 clkrate,
+			   const char *pmic_type, int pmic_irq,
+			   struct twl4030_platform_data *pmic_data)
+{
+	strncpy(pmic_i2c_board_info.type, pmic_type,
+		sizeof(pmic_i2c_board_info.type));
+	pmic_i2c_board_info.irq = pmic_irq;
+	pmic_i2c_board_info.platform_data = pmic_data;
+
+	omap_register_i2c_bus(bus, clkrate, &pmic_i2c_board_info, 1);
+}
+
+#if defined(CONFIG_TOUCHSCREEN_ADS7846) || \
+	defined(CONFIG_TOUCHSCREEN_ADS7846_MODULE)
+static struct omap2_mcspi_device_config ads7846_mcspi_config = {
+	.turbo_mode	= 0,
+	.single_channel	= 1,	/* 0: slave, 1: master */
+};
+
+static struct ads7846_platform_data ads7846_config = {
+	.x_max			= 0x0fff,
+	.y_max			= 0x0fff,
+	.x_plate_ohms		= 180,
+	.pressure_max		= 255,
+	.debounce_max		= 10,
+	.debounce_tol		= 3,
+	.debounce_rep		= 1,
+	.gpio_pendown		= -EINVAL,
+	.keep_vref_on		= 1,
+};
+
+static struct spi_board_info ads7846_spi_board_info __initdata = {
+	.modalias		= "ads7846",
+	.bus_num		= -EINVAL,
+	.chip_select		= 0,
+	.max_speed_hz		= 1500000,
+	.controller_data	= &ads7846_mcspi_config,
+	.irq			= -EINVAL,
+	.platform_data		= &ads7846_config,
+};
+
+void __init omap_ads7846_init(int bus_num, int gpio_pendown, int gpio_debounce,
+			      struct ads7846_platform_data *board_pdata)
+{
+	struct spi_board_info *spi_bi = &ads7846_spi_board_info;
+	int err;
+
+	err = gpio_request(gpio_pendown, "TS PenDown");
+	if (err) {
+		pr_err("Could not obtain gpio for TS PenDown: %d\n", err);
+		return;
+	}
+
+	gpio_direction_input(gpio_pendown);
+	gpio_export(gpio_pendown, 0);
+
+	if (gpio_debounce)
+		gpio_set_debounce(gpio_pendown, gpio_debounce);
+
+	ads7846_config.gpio_pendown = gpio_pendown;
+
+	spi_bi->bus_num	= bus_num;
+	spi_bi->irq	= OMAP_GPIO_IRQ(gpio_pendown);
+
+	if (board_pdata)
+		spi_bi->platform_data = board_pdata;
+
+	spi_register_board_info(&ads7846_spi_board_info, 1);
+}
+#else
+void __init omap_ads7846_init(int bus_num, int gpio_pendown, int gpio_debounce,
+			      struct ads7846_platform_data *board_pdata)
+{
+}
+#endif
+
+#if defined(CONFIG_MTD_NAND_OMAP2) || defined(CONFIG_MTD_NAND_OMAP2_MODULE)
+static struct omap_nand_platform_data nand_data = {
+	.dma_channel	= -1,		/* disable DMA in OMAP NAND driver */
+};
+
+void __init omap_nand_flash_init(int options, struct mtd_partition *parts,
+				 int nr_parts)
+{
+	u8 cs = 0;
+	u8 nandcs = GPMC_CS_NUM + 1;
+
+	/* find out the chip-select on which NAND exists */
+	while (cs < GPMC_CS_NUM) {
+		u32 ret = 0;
+		ret = gpmc_cs_read_reg(cs, GPMC_CS_CONFIG1);
+
+		if ((ret & 0xC00) == 0x800) {
+			printk(KERN_INFO "Found NAND on CS%d\n", cs);
+			if (nandcs > GPMC_CS_NUM)
+				nandcs = cs;
+		}
+		cs++;
+	}
+
+	if (nandcs > GPMC_CS_NUM) {
+		printk(KERN_INFO "NAND: Unable to find configuration "
+				 "in GPMC\n ");
+		return;
+	}
+
+	if (nandcs < GPMC_CS_NUM) {
+		nand_data.cs = nandcs;
+		nand_data.parts = parts;
+		nand_data.nr_parts = nr_parts;
+		nand_data.options = options;
+
+		printk(KERN_INFO "Registering NAND on CS%d\n", nandcs);
+		if (gpmc_nand_init(&nand_data) < 0)
+			printk(KERN_ERR "Unable to register NAND device\n");
+	}
+}
+#else
+void __init omap_nand_flash_init(int options, struct mtd_partition *parts,
+				 int nr_parts)
+{
+}
+#endif
diff --git a/arch/arm/mach-omap2/common-board-devices.h b/arch/arm/mach-omap2/common-board-devices.h
new file mode 100644
index 0000000..eb80b3b
--- /dev/null
+++ b/arch/arm/mach-omap2/common-board-devices.h
@@ -0,0 +1,35 @@
+#ifndef __OMAP_COMMON_BOARD_DEVICES__
+#define __OMAP_COMMON_BOARD_DEVICES__
+
+struct twl4030_platform_data;
+struct mtd_partition;
+
+void omap_pmic_init(int bus, u32 clkrate, const char *pmic_type, int pmic_irq,
+		    struct twl4030_platform_data *pmic_data);
+
+static inline void omap2_pmic_init(const char *pmic_type,
+				   struct twl4030_platform_data *pmic_data)
+{
+	omap_pmic_init(2, 2600, pmic_type, INT_24XX_SYS_NIRQ, pmic_data);
+}
+
+static inline void omap3_pmic_init(const char *pmic_type,
+				   struct twl4030_platform_data *pmic_data)
+{
+	omap_pmic_init(1, 2600, pmic_type, INT_34XX_SYS_NIRQ, pmic_data);
+}
+
+static inline void omap4_pmic_init(const char *pmic_type,
+				   struct twl4030_platform_data *pmic_data)
+{
+	/* Phoenix Audio IC needs I2C1 to start with 400 KHz or less */
+	omap_pmic_init(1, 400, pmic_type, OMAP44XX_IRQ_SYS_1N, pmic_data);
+}
+
+struct ads7846_platform_data;
+
+void omap_ads7846_init(int bus_num, int gpio_pendown, int gpio_debounce,
+		       struct ads7846_platform_data *board_pdata);
+void omap_nand_flash_init(int opts, struct mtd_partition *parts, int n_parts);
+
+#endif /* __OMAP_COMMON_BOARD_DEVICES__ */
diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index 1c240ef..4bf6e6e 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -36,36 +36,6 @@
 
 #ifdef CONFIG_CPU_IDLE
 
-#define OMAP3_MAX_STATES 7
-#define OMAP3_STATE_C1 0 /* C1 - MPU WFI + Core active */
-#define OMAP3_STATE_C2 1 /* C2 - MPU WFI + Core inactive */
-#define OMAP3_STATE_C3 2 /* C3 - MPU CSWR + Core inactive */
-#define OMAP3_STATE_C4 3 /* C4 - MPU OFF + Core iactive */
-#define OMAP3_STATE_C5 4 /* C5 - MPU RET + Core RET */
-#define OMAP3_STATE_C6 5 /* C6 - MPU OFF + Core RET */
-#define OMAP3_STATE_C7 6 /* C7 - MPU OFF + Core OFF */
-
-#define OMAP3_STATE_MAX OMAP3_STATE_C7
-
-#define CPUIDLE_FLAG_CHECK_BM	0x10000	/* use omap3_enter_idle_bm() */
-
-struct omap3_processor_cx {
-	u8 valid;
-	u8 type;
-	u32 sleep_latency;
-	u32 wakeup_latency;
-	u32 mpu_state;
-	u32 core_state;
-	u32 threshold;
-	u32 flags;
-	const char *desc;
-};
-
-struct omap3_processor_cx omap3_power_states[OMAP3_MAX_STATES];
-struct omap3_processor_cx current_cx_state;
-struct powerdomain *mpu_pd, *core_pd, *per_pd;
-struct powerdomain *cam_pd;
-
 /*
  * The latencies/thresholds for various C states have
  * to be configured from the respective board files.
@@ -75,27 +45,31 @@ struct powerdomain *cam_pd;
  */
 static struct cpuidle_params cpuidle_params_table[] = {
 	/* C1 */
-	{1, 2, 2, 5},
+	{2 + 2, 5, 1},
 	/* C2 */
-	{1, 10, 10, 30},
+	{10 + 10, 30, 1},
 	/* C3 */
-	{1, 50, 50, 300},
+	{50 + 50, 300, 1},
 	/* C4 */
-	{1, 1500, 1800, 4000},
+	{1500 + 1800, 4000, 1},
 	/* C5 */
-	{1, 2500, 7500, 12000},
+	{2500 + 7500, 12000, 1},
 	/* C6 */
-	{1, 3000, 8500, 15000},
+	{3000 + 8500, 15000, 1},
 	/* C7 */
-	{1, 10000, 30000, 300000},
+	{10000 + 30000, 300000, 1},
 };
+#define OMAP3_NUM_STATES ARRAY_SIZE(cpuidle_params_table)
 
-static int omap3_idle_bm_check(void)
-{
-	if (!omap3_can_sleep())
-		return 1;
-	return 0;
-}
+/* Mach specific information to be recorded in the C-state driver_data */
+struct omap3_idle_statedata {
+	u32 mpu_state;
+	u32 core_state;
+	u8 valid;
+};
+struct omap3_idle_statedata omap3_idle_data[OMAP3_NUM_STATES];
+
+struct powerdomain *mpu_pd, *core_pd, *per_pd, *cam_pd;
 
 static int _cpuidle_allow_idle(struct powerdomain *pwrdm,
 				struct clockdomain *clkdm)
@@ -122,12 +96,10 @@ static int _cpuidle_deny_idle(struct powerdomain *pwrdm,
 static int omap3_enter_idle(struct cpuidle_device *dev,
 			struct cpuidle_state *state)
 {
-	struct omap3_processor_cx *cx = cpuidle_get_statedata(state);
+	struct omap3_idle_statedata *cx = cpuidle_get_statedata(state);
 	struct timespec ts_preidle, ts_postidle, ts_idle;
 	u32 mpu_state = cx->mpu_state, core_state = cx->core_state;
 
-	current_cx_state = *cx;
-
 	/* Used to keep track of the total time in idle */
 	getnstimeofday(&ts_preidle);
 
@@ -140,7 +112,8 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 	if (omap_irq_pending() || need_resched())
 		goto return_sleep_time;
 
-	if (cx->type == OMAP3_STATE_C1) {
+	/* Deny idle for C1 */
+	if (state == &dev->states[0]) {
 		pwrdm_for_each_clkdm(mpu_pd, _cpuidle_deny_idle);
 		pwrdm_for_each_clkdm(core_pd, _cpuidle_deny_idle);
 	}
@@ -148,7 +121,8 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 	/* Execute ARM wfi */
 	omap_sram_idle();
 
-	if (cx->type == OMAP3_STATE_C1) {
+	/* Re-allow idle for C1 */
+	if (state == &dev->states[0]) {
 		pwrdm_for_each_clkdm(mpu_pd, _cpuidle_allow_idle);
 		pwrdm_for_each_clkdm(core_pd, _cpuidle_allow_idle);
 	}
@@ -164,41 +138,53 @@ return_sleep_time:
 }
 
 /**
- * next_valid_state - Find next valid c-state
+ * next_valid_state - Find next valid C-state
  * @dev: cpuidle device
- * @state: Currently selected c-state
+ * @state: Currently selected C-state
  *
  * If the current state is valid, it is returned back to the caller.
  * Else, this function searches for a lower c-state which is still
- * valid (as defined in omap3_power_states[]).
+ * valid.
+ *
+ * A state is valid if the 'valid' field is enabled and
+ * if it satisfies the enable_off_mode condition.
  */
 static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
-						struct cpuidle_state *curr)
+					      struct cpuidle_state *curr)
 {
 	struct cpuidle_state *next = NULL;
-	struct omap3_processor_cx *cx;
+	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr);
+	u32 mpu_deepest_state = PWRDM_POWER_RET;
+	u32 core_deepest_state = PWRDM_POWER_RET;
 
-	cx = (struct omap3_processor_cx *)cpuidle_get_statedata(curr);
+	if (enable_off_mode) {
+		mpu_deepest_state = PWRDM_POWER_OFF;
+		/*
+		 * Erratum i583: valable for ES rev < Es1.2 on 3630.
+		 * CORE OFF mode is not supported in a stable form, restrict
+		 * instead the CORE state to RET.
+		 */
+		if (!IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583))
+			core_deepest_state = PWRDM_POWER_OFF;
+	}
 
 	/* Check if current state is valid */
-	if (cx->valid) {
+	if ((cx->valid) &&
+	    (cx->mpu_state >= mpu_deepest_state) &&
+	    (cx->core_state >= core_deepest_state)) {
 		return curr;
 	} else {
-		u8 idx = OMAP3_STATE_MAX;
+		int idx = OMAP3_NUM_STATES - 1;
 
-		/*
-		 * Reach the current state starting at highest C-state
-		 */
-		for (; idx >= OMAP3_STATE_C1; idx--) {
+		/* Reach the current state starting at highest C-state */
+		for (; idx >= 0; idx--) {
 			if (&dev->states[idx] == curr) {
 				next = &dev->states[idx];
 				break;
 			}
 		}
 
-		/*
-		 * Should never hit this condition.
-		 */
+		/* Should never hit this condition */
 		WARN_ON(next == NULL);
 
 		/*
@@ -206,17 +192,17 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 		 * Start search from the next (lower) state.
 		 */
 		idx--;
-		for (; idx >= OMAP3_STATE_C1; idx--) {
-			struct omap3_processor_cx *cx;
-
+		for (; idx >= 0; idx--) {
 			cx = cpuidle_get_statedata(&dev->states[idx]);
-			if (cx->valid) {
+			if ((cx->valid) &&
+			    (cx->mpu_state >= mpu_deepest_state) &&
+			    (cx->core_state >= core_deepest_state)) {
 				next = &dev->states[idx];
 				break;
 			}
 		}
 		/*
-		 * C1 and C2 are always valid.
+		 * C1 is always valid.
 		 * So, no need to check for 'next==NULL' outside this loop.
 		 */
 	}
@@ -229,36 +215,22 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
  * @dev: cpuidle device
  * @state: The target state to be programmed
  *
- * Used for C states with CPUIDLE_FLAG_CHECK_BM flag set. This
- * function checks for any pending activity and then programs the
- * device to the specified or a safer state.
+ * This function checks for any pending activity and then programs
+ * the device to the specified or a safer state.
  */
 static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 			       struct cpuidle_state *state)
 {
-	struct cpuidle_state *new_state = next_valid_state(dev, state);
-	u32 core_next_state, per_next_state = 0, per_saved_state = 0;
-	u32 cam_state;
-	struct omap3_processor_cx *cx;
+	struct cpuidle_state *new_state;
+	u32 core_next_state, per_next_state = 0, per_saved_state = 0, cam_state;
+	struct omap3_idle_statedata *cx;
 	int ret;
 
-	if ((state->flags & CPUIDLE_FLAG_CHECK_BM) && omap3_idle_bm_check()) {
-		BUG_ON(!dev->safe_state);
+	if (!omap3_can_sleep()) {
 		new_state = dev->safe_state;
 		goto select_state;
 	}
 
-	cx = cpuidle_get_statedata(state);
-	core_next_state = cx->core_state;
-
-	/*
-	 * FIXME: we currently manage device-specific idle states
-	 *        for PER and CORE in combination with CPU-specific
-	 *        idle states.  This is wrong, and device-specific
-	 *        idle management needs to be separated out into 
-	 *        its own code.
-	 */
-
 	/*
 	 * Prevent idle completely if CAM is active.
 	 * CAM does not have wakeup capability in OMAP3.
@@ -270,9 +242,19 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	}
 
 	/*
+	 * FIXME: we currently manage device-specific idle states
+	 *        for PER and CORE in combination with CPU-specific
+	 *        idle states.  This is wrong, and device-specific
+	 *        idle management needs to be separated out into
+	 *        its own code.
+	 */
+
+	/*
 	 * Prevent PER off if CORE is not in retention or off as this
 	 * would disable PER wakeups completely.
 	 */
+	cx = cpuidle_get_statedata(state);
+	core_next_state = cx->core_state;
 	per_next_state = per_saved_state = pwrdm_read_next_pwrst(per_pd);
 	if ((per_next_state == PWRDM_POWER_OFF) &&
 	    (core_next_state > PWRDM_POWER_RET))
@@ -282,6 +264,8 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	if (per_next_state != per_saved_state)
 		pwrdm_set_next_pwrst(per_pd, per_next_state);
 
+	new_state = next_valid_state(dev, state);
+
 select_state:
 	dev->last_state = new_state;
 	ret = omap3_enter_idle(dev, new_state);
@@ -295,31 +279,6 @@ select_state:
 
 DEFINE_PER_CPU(struct cpuidle_device, omap3_idle_dev);
 
-/**
- * omap3_cpuidle_update_states() - Update the cpuidle states
- * @mpu_deepest_state:	Enable states up to and including this for mpu domain
- * @core_deepest_state:	Enable states up to and including this for core domain
- *
- * This goes through the list of states available and enables and disables the
- * validity of C states based on deepest state that can be achieved for the
- * variable domain
- */
-void omap3_cpuidle_update_states(u32 mpu_deepest_state, u32 core_deepest_state)
-{
-	int i;
-
-	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
-		struct omap3_processor_cx *cx = &omap3_power_states[i];
-
-		if ((cx->mpu_state >= mpu_deepest_state) &&
-		    (cx->core_state >= core_deepest_state)) {
-			cx->valid = 1;
-		} else {
-			cx->valid = 0;
-		}
-	}
-}
-
 void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params)
 {
 	int i;
@@ -327,212 +286,109 @@ void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params)
 	if (!cpuidle_board_params)
 		return;
 
-	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
-		cpuidle_params_table[i].valid =
-			cpuidle_board_params[i].valid;
-		cpuidle_params_table[i].sleep_latency =
-			cpuidle_board_params[i].sleep_latency;
-		cpuidle_params_table[i].wake_latency =
-			cpuidle_board_params[i].wake_latency;
-		cpuidle_params_table[i].threshold =
-			cpuidle_board_params[i].threshold;
+	for (i = 0; i < OMAP3_NUM_STATES; i++) {
+		cpuidle_params_table[i].valid =	cpuidle_board_params[i].valid;
+		cpuidle_params_table[i].exit_latency =
+			cpuidle_board_params[i].exit_latency;
+		cpuidle_params_table[i].target_residency =
+			cpuidle_board_params[i].target_residency;
 	}
 	return;
 }
 
-/* omap3_init_power_states - Initialises the OMAP3 specific C states.
- *
- * Below is the desciption of each C state.
- * 	C1 . MPU WFI + Core active
- *	C2 . MPU WFI + Core inactive
- *	C3 . MPU CSWR + Core inactive
- *	C4 . MPU OFF + Core inactive
- *	C5 . MPU CSWR + Core CSWR
- *	C6 . MPU OFF + Core CSWR
- *	C7 . MPU OFF + Core OFF
- */
-void omap_init_power_states(void)
-{
-	/* C1 . MPU WFI + Core active */
-	omap3_power_states[OMAP3_STATE_C1].valid =
-			cpuidle_params_table[OMAP3_STATE_C1].valid;
-	omap3_power_states[OMAP3_STATE_C1].type = OMAP3_STATE_C1;
-	omap3_power_states[OMAP3_STATE_C1].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C1].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C1].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C1].wake_latency;
-	omap3_power_states[OMAP3_STATE_C1].threshold =
-			cpuidle_params_table[OMAP3_STATE_C1].threshold;
-	omap3_power_states[OMAP3_STATE_C1].mpu_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C1].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C1].flags = CPUIDLE_FLAG_TIME_VALID;
-	omap3_power_states[OMAP3_STATE_C1].desc = "MPU ON + CORE ON";
-
-	/* C2 . MPU WFI + Core inactive */
-	omap3_power_states[OMAP3_STATE_C2].valid =
-			cpuidle_params_table[OMAP3_STATE_C2].valid;
-	omap3_power_states[OMAP3_STATE_C2].type = OMAP3_STATE_C2;
-	omap3_power_states[OMAP3_STATE_C2].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C2].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C2].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C2].wake_latency;
-	omap3_power_states[OMAP3_STATE_C2].threshold =
-			cpuidle_params_table[OMAP3_STATE_C2].threshold;
-	omap3_power_states[OMAP3_STATE_C2].mpu_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C2].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C2].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C2].desc = "MPU ON + CORE ON";
-
-	/* C3 . MPU CSWR + Core inactive */
-	omap3_power_states[OMAP3_STATE_C3].valid =
-			cpuidle_params_table[OMAP3_STATE_C3].valid;
-	omap3_power_states[OMAP3_STATE_C3].type = OMAP3_STATE_C3;
-	omap3_power_states[OMAP3_STATE_C3].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C3].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C3].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C3].wake_latency;
-	omap3_power_states[OMAP3_STATE_C3].threshold =
-			cpuidle_params_table[OMAP3_STATE_C3].threshold;
-	omap3_power_states[OMAP3_STATE_C3].mpu_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C3].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C3].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C3].desc = "MPU RET + CORE ON";
-
-	/* C4 . MPU OFF + Core inactive */
-	omap3_power_states[OMAP3_STATE_C4].valid =
-			cpuidle_params_table[OMAP3_STATE_C4].valid;
-	omap3_power_states[OMAP3_STATE_C4].type = OMAP3_STATE_C4;
-	omap3_power_states[OMAP3_STATE_C4].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C4].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C4].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C4].wake_latency;
-	omap3_power_states[OMAP3_STATE_C4].threshold =
-			cpuidle_params_table[OMAP3_STATE_C4].threshold;
-	omap3_power_states[OMAP3_STATE_C4].mpu_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C4].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C4].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C4].desc = "MPU OFF + CORE ON";
-
-	/* C5 . MPU CSWR + Core CSWR*/
-	omap3_power_states[OMAP3_STATE_C5].valid =
-			cpuidle_params_table[OMAP3_STATE_C5].valid;
-	omap3_power_states[OMAP3_STATE_C5].type = OMAP3_STATE_C5;
-	omap3_power_states[OMAP3_STATE_C5].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C5].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C5].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C5].wake_latency;
-	omap3_power_states[OMAP3_STATE_C5].threshold =
-			cpuidle_params_table[OMAP3_STATE_C5].threshold;
-	omap3_power_states[OMAP3_STATE_C5].mpu_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C5].core_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C5].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C5].desc = "MPU RET + CORE RET";
-
-	/* C6 . MPU OFF + Core CSWR */
-	omap3_power_states[OMAP3_STATE_C6].valid =
-			cpuidle_params_table[OMAP3_STATE_C6].valid;
-	omap3_power_states[OMAP3_STATE_C6].type = OMAP3_STATE_C6;
-	omap3_power_states[OMAP3_STATE_C6].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C6].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C6].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C6].wake_latency;
-	omap3_power_states[OMAP3_STATE_C6].threshold =
-			cpuidle_params_table[OMAP3_STATE_C6].threshold;
-	omap3_power_states[OMAP3_STATE_C6].mpu_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C6].core_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C6].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C6].desc = "MPU OFF + CORE RET";
-
-	/* C7 . MPU OFF + Core OFF */
-	omap3_power_states[OMAP3_STATE_C7].valid =
-			cpuidle_params_table[OMAP3_STATE_C7].valid;
-	omap3_power_states[OMAP3_STATE_C7].type = OMAP3_STATE_C7;
-	omap3_power_states[OMAP3_STATE_C7].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C7].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C7].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C7].wake_latency;
-	omap3_power_states[OMAP3_STATE_C7].threshold =
-			cpuidle_params_table[OMAP3_STATE_C7].threshold;
-	omap3_power_states[OMAP3_STATE_C7].mpu_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C7].core_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C7].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C7].desc = "MPU OFF + CORE OFF";
-
-	/*
-	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
-	 * enable OFF mode in a stable form for previous revisions.
-	 * we disable C7 state as a result.
-	 */
-	if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) {
-		omap3_power_states[OMAP3_STATE_C7].valid = 0;
-		cpuidle_params_table[OMAP3_STATE_C7].valid = 0;
-		pr_warn("%s: core off state C7 disabled due to i583\n",
-				__func__);
-	}
-}
-
 struct cpuidle_driver omap3_idle_driver = {
 	.name = 	"omap3_idle",
 	.owner = 	THIS_MODULE,
 };
 
+/* Helper to fill the C-state common data and register the driver_data */
+static inline struct omap3_idle_statedata *_fill_cstate(
+					struct cpuidle_device *dev,
+					int idx, const char *descr)
+{
+	struct omap3_idle_statedata *cx = &omap3_idle_data[idx];
+	struct cpuidle_state *state = &dev->states[idx];
+
+	state->exit_latency	= cpuidle_params_table[idx].exit_latency;
+	state->target_residency	= cpuidle_params_table[idx].target_residency;
+	state->flags		= CPUIDLE_FLAG_TIME_VALID;
+	state->enter		= omap3_enter_idle_bm;
+	cx->valid		= cpuidle_params_table[idx].valid;
+	sprintf(state->name, "C%d", idx + 1);
+	strncpy(state->desc, descr, CPUIDLE_DESC_LEN);
+	cpuidle_set_statedata(state, cx);
+
+	return cx;
+}
+
 /**
  * omap3_idle_init - Init routine for OMAP3 idle
  *
- * Registers the OMAP3 specific cpuidle driver with the cpuidle
+ * Registers the OMAP3 specific cpuidle driver to the cpuidle
  * framework with the valid set of states.
  */
 int __init omap3_idle_init(void)
 {
-	int i, count = 0;
-	struct omap3_processor_cx *cx;
-	struct cpuidle_state *state;
 	struct cpuidle_device *dev;
+	struct omap3_idle_statedata *cx;
 
 	mpu_pd = pwrdm_lookup("mpu_pwrdm");
 	core_pd = pwrdm_lookup("core_pwrdm");
 	per_pd = pwrdm_lookup("per_pwrdm");
 	cam_pd = pwrdm_lookup("cam_pwrdm");
 
-	omap_init_power_states();
 	cpuidle_register_driver(&omap3_idle_driver);
-
 	dev = &per_cpu(omap3_idle_dev, smp_processor_id());
 
-	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
-		cx = &omap3_power_states[i];
-		state = &dev->states[count];
-
-		if (!cx->valid)
-			continue;
-		cpuidle_set_statedata(state, cx);
-		state->exit_latency = cx->sleep_latency + cx->wakeup_latency;
-		state->target_residency = cx->threshold;
-		state->flags = cx->flags;
-		state->enter = (state->flags & CPUIDLE_FLAG_CHECK_BM) ?
-			omap3_enter_idle_bm : omap3_enter_idle;
-		if (cx->type == OMAP3_STATE_C1)
-			dev->safe_state = state;
-		sprintf(state->name, "C%d", count+1);
-		strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
-		count++;
-	}
+	/* C1 . MPU WFI + Core active */
+	cx = _fill_cstate(dev, 0, "MPU ON + CORE ON");
+	(&dev->states[0])->enter = omap3_enter_idle;
+	dev->safe_state = &dev->states[0];
+	cx->valid = 1;	/* C1 is always valid */
+	cx->mpu_state = PWRDM_POWER_ON;
+	cx->core_state = PWRDM_POWER_ON;
 
-	if (!count)
-		return -EINVAL;
-	dev->state_count = count;
+	/* C2 . MPU WFI + Core inactive */
+	cx = _fill_cstate(dev, 1, "MPU ON + CORE ON");
+	cx->mpu_state = PWRDM_POWER_ON;
+	cx->core_state = PWRDM_POWER_ON;
+
+	/* C3 . MPU CSWR + Core inactive */
+	cx = _fill_cstate(dev, 2, "MPU RET + CORE ON");
+	cx->mpu_state = PWRDM_POWER_RET;
+	cx->core_state = PWRDM_POWER_ON;
 
-	if (enable_off_mode)
-		omap3_cpuidle_update_states(PWRDM_POWER_OFF, PWRDM_POWER_OFF);
-	else
-		omap3_cpuidle_update_states(PWRDM_POWER_RET, PWRDM_POWER_RET);
+	/* C4 . MPU OFF + Core inactive */
+	cx = _fill_cstate(dev, 3, "MPU OFF + CORE ON");
+	cx->mpu_state = PWRDM_POWER_OFF;
+	cx->core_state = PWRDM_POWER_ON;
+
+	/* C5 . MPU RET + Core RET */
+	cx = _fill_cstate(dev, 4, "MPU RET + CORE RET");
+	cx->mpu_state = PWRDM_POWER_RET;
+	cx->core_state = PWRDM_POWER_RET;
+
+	/* C6 . MPU OFF + Core RET */
+	cx = _fill_cstate(dev, 5, "MPU OFF + CORE RET");
+	cx->mpu_state = PWRDM_POWER_OFF;
+	cx->core_state = PWRDM_POWER_RET;
+
+	/* C7 . MPU OFF + Core OFF */
+	cx = _fill_cstate(dev, 6, "MPU OFF + CORE OFF");
+	/*
+	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
+	 * enable OFF mode in a stable form for previous revisions.
+	 * We disable C7 state as a result.
+	 */
+	if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) {
+		cx->valid = 0;
+		pr_warn("%s: core off state C7 disabled due to i583\n",
+			__func__);
+	}
+	cx->mpu_state = PWRDM_POWER_OFF;
+	cx->core_state = PWRDM_POWER_OFF;
 
+	dev->state_count = OMAP3_NUM_STATES;
 	if (cpuidle_register_device(dev)) {
 		printk(KERN_ERR "%s: CPUidle register device failed\n",
 		       __func__);
diff --git a/arch/arm/mach-omap2/gpmc-smc91x.c b/arch/arm/mach-omap2/gpmc-smc91x.c
index 877c6f5..ba10c24 100644
--- a/arch/arm/mach-omap2/gpmc-smc91x.c
+++ b/arch/arm/mach-omap2/gpmc-smc91x.c
@@ -147,25 +147,24 @@ void __init gpmc_smc91x_init(struct omap_smc91x_platform_data *board_data)
 			goto free1;
 	}
 
-	if (gpio_request(gpmc_cfg->gpio_irq, "SMC91X irq") < 0)
+	if (gpio_request_one(gpmc_cfg->gpio_irq, GPIOF_IN, "SMC91X irq") < 0)
 		goto free1;
 
-	gpio_direction_input(gpmc_cfg->gpio_irq);
 	gpmc_smc91x_resources[1].start = gpio_to_irq(gpmc_cfg->gpio_irq);
 
 	if (gpmc_cfg->gpio_pwrdwn) {
-		ret = gpio_request(gpmc_cfg->gpio_pwrdwn, "SMC91X powerdown");
+		ret = gpio_request_one(gpmc_cfg->gpio_pwrdwn,
+				       GPIOF_OUT_INIT_LOW, "SMC91X powerdown");
 		if (ret)
 			goto free2;
-		gpio_direction_output(gpmc_cfg->gpio_pwrdwn, 0);
 	}
 
 	if (gpmc_cfg->gpio_reset) {
-		ret = gpio_request(gpmc_cfg->gpio_reset, "SMC91X reset");
+		ret = gpio_request_one(gpmc_cfg->gpio_reset,
+				       GPIOF_OUT_INIT_LOW, "SMC91X reset");
 		if (ret)
 			goto free3;
 
-		gpio_direction_output(gpmc_cfg->gpio_reset, 0);
 		gpio_set_value(gpmc_cfg->gpio_reset, 1);
 		msleep(100);
 		gpio_set_value(gpmc_cfg->gpio_reset, 0);
diff --git a/arch/arm/mach-omap2/gpmc-smsc911x.c b/arch/arm/mach-omap2/gpmc-smsc911x.c
index 703f150..9970331 100644
--- a/arch/arm/mach-omap2/gpmc-smsc911x.c
+++ b/arch/arm/mach-omap2/gpmc-smsc911x.c
@@ -10,6 +10,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#define pr_fmt(fmt) "%s: " fmt, __func__
 
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
@@ -30,7 +31,7 @@ static struct resource gpmc_smsc911x_resources[] = {
 		.flags		= IORESOURCE_MEM,
 	},
 	[1] = {
-		.flags		= IORESOURCE_IRQ,
+		.flags		= IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
 	},
 };
 
@@ -41,16 +42,6 @@ static struct smsc911x_platform_config gpmc_smsc911x_config = {
 	.flags		= SMSC911X_USE_16BIT,
 };
 
-static struct platform_device gpmc_smsc911x_device = {
-	.name		= "smsc911x",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(gpmc_smsc911x_resources),
-	.resource	= gpmc_smsc911x_resources,
-	.dev		= {
-		.platform_data = &gpmc_smsc911x_config,
-	},
-};
-
 /*
  * Initialize smsc911x device connected to the GPMC. Note that we
  * assume that pin multiplexing is done in the board-*.c file,
@@ -58,46 +49,49 @@ static struct platform_device gpmc_smsc911x_device = {
  */
 void __init gpmc_smsc911x_init(struct omap_smsc911x_platform_data *board_data)
 {
+	struct platform_device *pdev;
 	unsigned long cs_mem_base;
 	int ret;
 
 	gpmc_cfg = board_data;
 
 	if (gpmc_cs_request(gpmc_cfg->cs, SZ_16M, &cs_mem_base) < 0) {
-		printk(KERN_ERR "Failed to request GPMC mem for smsc911x\n");
+		pr_err("Failed to request GPMC mem region\n");
 		return;
 	}
 
 	gpmc_smsc911x_resources[0].start = cs_mem_base + 0x0;
 	gpmc_smsc911x_resources[0].end = cs_mem_base + 0xff;
 
-	if (gpio_request(gpmc_cfg->gpio_irq, "smsc911x irq") < 0) {
-		printk(KERN_ERR "Failed to request GPIO%d for smsc911x IRQ\n",
-				gpmc_cfg->gpio_irq);
+	if (gpio_request_one(gpmc_cfg->gpio_irq, GPIOF_IN, "smsc911x irq")) {
+		pr_err("Failed to request IRQ GPIO%d\n", gpmc_cfg->gpio_irq);
 		goto free1;
 	}
 
-	gpio_direction_input(gpmc_cfg->gpio_irq);
 	gpmc_smsc911x_resources[1].start = gpio_to_irq(gpmc_cfg->gpio_irq);
-	gpmc_smsc911x_resources[1].flags |=
-					(gpmc_cfg->flags & IRQF_TRIGGER_MASK);
 
 	if (gpio_is_valid(gpmc_cfg->gpio_reset)) {
-		ret = gpio_request(gpmc_cfg->gpio_reset, "smsc911x reset");
+		ret = gpio_request_one(gpmc_cfg->gpio_reset,
+				       GPIOF_OUT_INIT_HIGH, "smsc911x reset");
 		if (ret) {
-			printk(KERN_ERR "Failed to request GPIO%d for smsc911x reset\n",
-					gpmc_cfg->gpio_reset);
+			pr_err("Failed to request reset GPIO%d\n",
+			       gpmc_cfg->gpio_reset);
 			goto free2;
 		}
 
-		gpio_direction_output(gpmc_cfg->gpio_reset, 1);
 		gpio_set_value(gpmc_cfg->gpio_reset, 0);
 		msleep(100);
 		gpio_set_value(gpmc_cfg->gpio_reset, 1);
 	}
 
-	if (platform_device_register(&gpmc_smsc911x_device) < 0) {
-		printk(KERN_ERR "Unable to register smsc911x device\n");
+	if (gpmc_cfg->flags)
+		gpmc_smsc911x_config.flags = gpmc_cfg->flags;
+
+	pdev = platform_device_register_resndata(NULL, "smsc911x", gpmc_cfg->id,
+		 gpmc_smsc911x_resources, ARRAY_SIZE(gpmc_smsc911x_resources),
+		 &gpmc_smsc911x_config, sizeof(gpmc_smsc911x_config));
+	if (!pdev) {
+		pr_err("Unable to register platform device\n");
 		gpio_free(gpmc_cfg->gpio_reset);
 		goto free2;
 	}
@@ -109,5 +103,5 @@ free2:
 free1:
 	gpmc_cs_free(gpmc_cfg->cs);
 
-	printk(KERN_ERR "Could not initialize smsc911x\n");
+	pr_err("Could not initialize smsc911x device\n");
 }
diff --git a/arch/arm/mach-omap2/omap_l3_noc.c b/arch/arm/mach-omap2/omap_l3_noc.c
index 82632c2..7b9f190 100644
--- a/arch/arm/mach-omap2/omap_l3_noc.c
+++ b/arch/arm/mach-omap2/omap_l3_noc.c
@@ -63,10 +63,7 @@ static irqreturn_t l3_interrupt_handler(int irq, void *_l3)
 	char *source_name;
 
 	/* Get the Type of interrupt */
-	if (irq == l3->app_irq)
-		inttype = L3_APPLICATION_ERROR;
-	else
-		inttype = L3_DEBUG_ERROR;
+	inttype = irq == l3->app_irq ? L3_APPLICATION_ERROR : L3_DEBUG_ERROR;
 
 	for (i = 0; i < L3_MODULES; i++) {
 		/*
@@ -84,10 +81,10 @@ static irqreturn_t l3_interrupt_handler(int irq, void *_l3)
 
 			err_src = j;
 			/* Read the stderrlog_main_source from clk domain */
-			std_err_main_addr = base + (*(l3_targ[i] + err_src));
-			std_err_main =  readl(std_err_main_addr);
+			std_err_main_addr = base + *(l3_targ[i] + err_src);
+			std_err_main = readl(std_err_main_addr);
 
-			switch ((std_err_main & CUSTOM_ERROR)) {
+			switch (std_err_main & CUSTOM_ERROR) {
 			case STANDARD_ERROR:
 				source_name =
 				l3_targ_stderrlog_main_name[i][err_src];
@@ -132,49 +129,49 @@ static int __init omap4_l3_probe(struct platform_device *pdev)
 
 	l3 = kzalloc(sizeof(*l3), GFP_KERNEL);
 	if (!l3)
-		ret = -ENOMEM;
+		return -ENOMEM;
 
 	platform_set_drvdata(pdev, l3);
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res) {
 		dev_err(&pdev->dev, "couldn't find resource 0\n");
 		ret = -ENODEV;
-		goto err1;
+		goto err0;
 	}
 
 	l3->l3_base[0] = ioremap(res->start, resource_size(res));
-	if (!(l3->l3_base[0])) {
+	if (!l3->l3_base[0]) {
 		dev_err(&pdev->dev, "ioremap failed\n");
 		ret = -ENOMEM;
-		goto err2;
+		goto err0;
 	}
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
 	if (!res) {
 		dev_err(&pdev->dev, "couldn't find resource 1\n");
 		ret = -ENODEV;
-		goto err3;
+		goto err1;
 	}
 
 	l3->l3_base[1] = ioremap(res->start, resource_size(res));
-	if (!(l3->l3_base[1])) {
+	if (!l3->l3_base[1]) {
 		dev_err(&pdev->dev, "ioremap failed\n");
 		ret = -ENOMEM;
-		goto err4;
+		goto err1;
 	}
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 2);
 	if (!res) {
 		dev_err(&pdev->dev, "couldn't find resource 2\n");
 		ret = -ENODEV;
-		goto err5;
+		goto err2;
 	}
 
 	l3->l3_base[2] = ioremap(res->start, resource_size(res));
-	if (!(l3->l3_base[2])) {
+	if (!l3->l3_base[2]) {
 		dev_err(&pdev->dev, "ioremap failed\n");
 		ret = -ENOMEM;
-		goto err6;
+		goto err2;
 	}
 
 	/*
@@ -187,7 +184,7 @@ static int __init omap4_l3_probe(struct platform_device *pdev)
 	if (ret) {
 		pr_crit("L3: request_irq failed to register for 0x%x\n",
 					 OMAP44XX_IRQ_L3_DBG);
-		goto err7;
+		goto err3;
 	}
 	l3->debug_irq = irq;
 
@@ -198,24 +195,22 @@ static int __init omap4_l3_probe(struct platform_device *pdev)
 	if (ret) {
 		pr_crit("L3: request_irq failed to register for 0x%x\n",
 					 OMAP44XX_IRQ_L3_APP);
-		goto err8;
+		goto err4;
 	}
 	l3->app_irq = irq;
 
-	goto err0;
-err8:
-err7:
-	iounmap(l3->l3_base[2]);
-err6:
-err5:
-	iounmap(l3->l3_base[1]);
+	return 0;
+
 err4:
+	free_irq(l3->debug_irq, l3);
 err3:
-	iounmap(l3->l3_base[0]);
+	iounmap(l3->l3_base[2]);
 err2:
+	iounmap(l3->l3_base[1]);
 err1:
-	kfree(l3);
+	iounmap(l3->l3_base[0]);
 err0:
+	kfree(l3);
 	return ret;
 }
 
diff --git a/arch/arm/mach-omap2/omap_l3_smx.c b/arch/arm/mach-omap2/omap_l3_smx.c
index 4321e79..873c0e3 100644
--- a/arch/arm/mach-omap2/omap_l3_smx.c
+++ b/arch/arm/mach-omap2/omap_l3_smx.c
@@ -155,7 +155,7 @@ static irqreturn_t omap3_l3_block_irq(struct omap3_l3 *l3,
 	u8                      multi = error & L3_ERROR_LOG_MULTI;
 	u32			address = omap3_l3_decode_addr(error_addr);
 
-	WARN(true, "%s Error seen by %s %s at address %x\n",
+	WARN(true, "%s seen by %s %s at address %x\n",
 				 omap3_l3_code_string(code),
 			  omap3_l3_initiator_string(initid),
 			     multi ? "Multiple Errors" : "",
@@ -167,21 +167,15 @@ static irqreturn_t omap3_l3_block_irq(struct omap3_l3 *l3,
 static irqreturn_t omap3_l3_app_irq(int irq, void *_l3)
 {
 	struct omap3_l3         *l3 = _l3;
-
 	u64                     status, clear;
 	u64                     error;
 	u64			error_addr;
 	u64			err_source = 0;
 	void			__iomem *base;
 	int			int_type;
-
 	irqreturn_t             ret = IRQ_NONE;
 
-	if (irq == l3->app_irq)
-		int_type = L3_APPLICATION_ERROR;
-	else
-		int_type = L3_DEBUG_ERROR;
-
+	int_type = irq == l3->app_irq ? L3_APPLICATION_ERROR : L3_DEBUG_ERROR;
 	if (!int_type) {
 		status = omap3_l3_readll(l3->rt, L3_SI_FLAG_STATUS_0);
 		/*
@@ -202,7 +196,6 @@ static irqreturn_t omap3_l3_app_irq(int irq, void *_l3)
 
 	base = l3->rt + *(omap3_l3_bases[int_type] + err_source);
 	error = omap3_l3_readll(base, L3_ERROR_LOG);
-
 	if (error) {
 		error_addr = omap3_l3_readll(base, L3_ERROR_LOG_ADDR);
 
@@ -210,9 +203,8 @@ static irqreturn_t omap3_l3_app_irq(int irq, void *_l3)
 	}
 
 	/* Clear the status register */
-	clear = ((L3_AGENT_STATUS_CLEAR_IA << int_type) |
-		 (L3_AGENT_STATUS_CLEAR_TA));
-
+	clear = (L3_AGENT_STATUS_CLEAR_IA << int_type) |
+		L3_AGENT_STATUS_CLEAR_TA;
 	omap3_l3_writell(base, L3_AGENT_STATUS, clear);
 
 	/* clear the error log register */
@@ -228,10 +220,8 @@ static int __init omap3_l3_probe(struct platform_device *pdev)
 	int                     ret;
 
 	l3 = kzalloc(sizeof(*l3), GFP_KERNEL);
-	if (!l3) {
-		ret = -ENOMEM;
-		goto err0;
-	}
+	if (!l3)
+		return -ENOMEM;
 
 	platform_set_drvdata(pdev, l3);
 
@@ -239,13 +229,13 @@ static int __init omap3_l3_probe(struct platform_device *pdev)
 	if (!res) {
 		dev_err(&pdev->dev, "couldn't find resource\n");
 		ret = -ENODEV;
-		goto err1;
+		goto err0;
 	}
 	l3->rt = ioremap(res->start, resource_size(res));
-	if (!(l3->rt)) {
+	if (!l3->rt) {
 		dev_err(&pdev->dev, "ioremap failed\n");
 		ret = -ENOMEM;
-		goto err2;
+		goto err0;
 	}
 
 	l3->debug_irq = platform_get_irq(pdev, 0);
@@ -254,28 +244,26 @@ static int __init omap3_l3_probe(struct platform_device *pdev)
 		"l3-debug-irq", l3);
 	if (ret) {
 		dev_err(&pdev->dev, "couldn't request debug irq\n");
-		goto err3;
+		goto err1;
 	}
 
 	l3->app_irq = platform_get_irq(pdev, 1);
 	ret = request_irq(l3->app_irq, omap3_l3_app_irq,
 		IRQF_DISABLED | IRQF_TRIGGER_RISING,
 		"l3-app-irq", l3);
-
 	if (ret) {
 		dev_err(&pdev->dev, "couldn't request app irq\n");
-		goto err4;
+		goto err2;
 	}
 
-	goto err0;
+	return 0;
 
-err4:
-err3:
-	iounmap(l3->rt);
 err2:
+	free_irq(l3->debug_irq, l3);
 err1:
-	kfree(l3);
+	iounmap(l3->rt);
 err0:
+	kfree(l3);
 	return ret;
 }
 
diff --git a/arch/arm/mach-omap2/omap_phy_internal.c b/arch/arm/mach-omap2/omap_phy_internal.c
index 05f6abc..f47813e 100644
--- a/arch/arm/mach-omap2/omap_phy_internal.c
+++ b/arch/arm/mach-omap2/omap_phy_internal.c
@@ -50,13 +50,16 @@ int omap4430_phy_init(struct device *dev)
 {
 	ctrl_base = ioremap(OMAP443X_SCM_BASE, SZ_1K);
 	if (!ctrl_base) {
-		dev_err(dev, "control module ioremap failed\n");
+		pr_err("control module ioremap failed\n");
 		return -ENOMEM;
 	}
 	/* Power down the phy */
 	__raw_writel(PHY_PD, ctrl_base + CONTROL_DEV_CONF);
-	phyclk = clk_get(dev, "ocp2scp_usb_phy_ick");
 
+	if (!dev)
+		return 0;
+
+	phyclk = clk_get(dev, "ocp2scp_usb_phy_ick");
 	if (IS_ERR(phyclk)) {
 		dev_err(dev, "cannot clk_get ocp2scp_usb_phy_ick\n");
 		iounmap(ctrl_base);
@@ -228,7 +231,7 @@ void am35x_musb_clear_irq(void)
 	regval = omap_ctrl_readl(AM35XX_CONTROL_LVL_INTR_CLEAR);
 }
 
-void am35x_musb_set_mode(u8 musb_mode)
+void am35x_set_mode(u8 musb_mode)
 {
 	u32 devconf2 = omap_ctrl_readl(AM35XX_CONTROL_DEVCONF2);
 
diff --git a/arch/arm/mach-omap2/pm.h b/arch/arm/mach-omap2/pm.h
index 797bfd1..45bcfce 100644
--- a/arch/arm/mach-omap2/pm.h
+++ b/arch/arm/mach-omap2/pm.h
@@ -36,11 +36,16 @@ static inline int omap4_opp_init(void)
 }
 #endif
 
+/*
+ * cpuidle mach specific parameters
+ *
+ * The board code can override the default C-states definition using
+ * omap3_pm_init_cpuidle
+ */
 struct cpuidle_params {
-	u8  valid;
-	u32 sleep_latency;
-	u32 wake_latency;
-	u32 threshold;
+	u32 exit_latency;	/* exit_latency = sleep + wake-up latencies */
+	u32 target_residency;
+	u8 valid;		/* validates the C-state */
 };
 
 #if defined(CONFIG_PM) && defined(CONFIG_CPU_IDLE)
@@ -73,10 +78,6 @@ extern u32 sleep_while_idle;
 #define sleep_while_idle 0
 #endif
 
-#if defined(CONFIG_CPU_IDLE)
-extern void omap3_cpuidle_update_states(u32, u32);
-#endif
-
 #if defined(CONFIG_PM_DEBUG) && defined(CONFIG_DEBUG_FS)
 extern void pm_dbg_update_time(struct powerdomain *pwrdm, int prev);
 extern int pm_dbg_regset_save(int reg_set);
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index 0c5e3a4..c155c9d 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -779,18 +779,6 @@ void omap3_pm_off_mode_enable(int enable)
 	else
 		state = PWRDM_POWER_RET;
 
-#ifdef CONFIG_CPU_IDLE
-	/*
-	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
-	 * enable OFF mode in a stable form for previous revisions, restrict
-	 * instead to RET
-	 */
-	if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583))
-		omap3_cpuidle_update_states(state, PWRDM_POWER_RET);
-	else
-		omap3_cpuidle_update_states(state, state);
-#endif
-
 	list_for_each_entry(pwrst, &pwrst_list, node) {
 		if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583) &&
 				pwrst->pwrdm == core_pwrdm &&
@@ -895,8 +883,6 @@ static int __init omap3_pm_init(void)
 
 	pm_errata_configure();
 
-	printk(KERN_ERR "Power Management for TI OMAP3.\n");
-
 	/* XXX prcm_setup_regs needs to be before enabling hw
 	 * supervised mode for powerdomains */
 	prcm_setup_regs();
diff --git a/arch/arm/mach-omap2/pm44xx.c b/arch/arm/mach-omap2/pm44xx.c
index 76cfff2..59a870b 100644
--- a/arch/arm/mach-omap2/pm44xx.c
+++ b/arch/arm/mach-omap2/pm44xx.c
@@ -105,13 +105,11 @@ static int __init omap4_pm_init(void)
 
 	pr_err("Power Management for TI OMAP4.\n");
 
-#ifdef CONFIG_PM
 	ret = pwrdm_for_each(pwrdms_setup, NULL);
 	if (ret) {
 		pr_err("Failed to setup powerdomains\n");
 		goto err2;
 	}
-#endif
 
 #ifdef CONFIG_SUSPEND
 	suspend_set_ops(&omap_pm_ops);
diff --git a/arch/arm/mach-omap2/smartreflex.c b/arch/arm/mach-omap2/smartreflex.c
index 13e24f9..fb7dc52 100644
--- a/arch/arm/mach-omap2/smartreflex.c
+++ b/arch/arm/mach-omap2/smartreflex.c
@@ -847,6 +847,14 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 		goto err_free_devinfo;
 	}
 
+	mem = request_mem_region(mem->start, resource_size(mem),
+					dev_name(&pdev->dev));
+	if (!mem) {
+		dev_err(&pdev->dev, "%s: no mem region\n", __func__);
+		ret = -EBUSY;
+		goto err_free_devinfo;
+	}
+
 	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
 
 	pm_runtime_enable(&pdev->dev);
@@ -883,7 +891,7 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 		ret = sr_late_init(sr_info);
 		if (ret) {
 			pr_warning("%s: Error in SR late init\n", __func__);
-			goto err_release_region;
+			return ret;
 		}
 	}
 
@@ -896,7 +904,7 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 	vdd_dbg_dir = omap_voltage_get_dbgdir(sr_info->voltdm);
 	if (!vdd_dbg_dir) {
 		ret = -EINVAL;
-		goto err_release_region;
+		goto err_iounmap;
 	}
 
 	sr_info->dbg_dir = debugfs_create_dir("smartreflex", vdd_dbg_dir);
@@ -904,7 +912,7 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev, "%s: Unable to create debugfs directory\n",
 			__func__);
 		ret = PTR_ERR(sr_info->dbg_dir);
-		goto err_release_region;
+		goto err_iounmap;
 	}
 
 	(void) debugfs_create_file("autocomp", S_IRUGO | S_IWUSR,
@@ -921,7 +929,7 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev, "%s: Unable to create debugfs directory"
 			"for n-values\n", __func__);
 		ret = PTR_ERR(nvalue_dir);
-		goto err_release_region;
+		goto err_debugfs;
 	}
 
 	omap_voltage_get_volttable(sr_info->voltdm, &volt_data);
@@ -931,7 +939,7 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 			"entries for n-values\n",
 			__func__, sr_info->voltdm->name);
 		ret = -ENODATA;
-		goto err_release_region;
+		goto err_debugfs;
 	}
 
 	for (i = 0; i < sr_info->nvalue_count; i++) {
@@ -945,6 +953,11 @@ static int __init omap_sr_probe(struct platform_device *pdev)
 
 	return ret;
 
+err_debugfs:
+	debugfs_remove_recursive(sr_info->dbg_dir);
+err_iounmap:
+	list_del(&sr_info->node);
+	iounmap(sr_info->base);
 err_release_region:
 	release_mem_region(mem->start, resource_size(mem));
 err_free_devinfo:
diff --git a/arch/arm/mach-omap2/usb-musb.c b/arch/arm/mach-omap2/usb-musb.c
index 35559f7..c7ed540 100644
--- a/arch/arm/mach-omap2/usb-musb.c
+++ b/arch/arm/mach-omap2/usb-musb.c
@@ -108,7 +108,13 @@ static void usb_musb_mux_init(struct omap_musb_board_data *board_data)
 	}
 }
 
-void __init usb_musb_init(struct omap_musb_board_data *board_data)
+static struct omap_musb_board_data musb_default_board_data = {
+	.interface_type		= MUSB_INTERFACE_ULPI,
+	.mode			= MUSB_OTG,
+	.power			= 100,
+};
+
+void __init usb_musb_init(struct omap_musb_board_data *musb_board_data)
 {
 	struct omap_hwmod		*oh;
 	struct omap_device		*od;
@@ -116,11 +122,12 @@ void __init usb_musb_init(struct omap_musb_board_data *board_data)
 	struct device			*dev;
 	int				bus_id = -1;
 	const char			*oh_name, *name;
+	struct omap_musb_board_data	*board_data;
 
-	if (cpu_is_omap3517() || cpu_is_omap3505()) {
-	} else if (cpu_is_omap44xx()) {
-		usb_musb_mux_init(board_data);
-	}
+	if (musb_board_data)
+		board_data = musb_board_data;
+	else
+		board_data = &musb_default_board_data;
 
 	/*
 	 * REVISIT: This line can be removed once all the platforms using
@@ -164,10 +171,15 @@ void __init usb_musb_init(struct omap_musb_board_data *board_data)
 	dev->dma_mask = &musb_dmamask;
 	dev->coherent_dma_mask = musb_dmamask;
 	put_device(dev);
+
+	if (cpu_is_omap44xx())
+		omap4430_phy_init(dev);
 }
 
 #else
 void __init usb_musb_init(struct omap_musb_board_data *board_data)
 {
+	if (cpu_is_omap44xx())
+		omap4430_phy_init(NULL);
 }
 #endif /* CONFIG_USB_MUSB_SOC */
diff --git a/arch/arm/mach-omap2/usb-tusb6010.c b/arch/arm/mach-omap2/usb-tusb6010.c
index 8a3c05f..8dd26b7 100644
--- a/arch/arm/mach-omap2/usb-tusb6010.c
+++ b/arch/arm/mach-omap2/usb-tusb6010.c
@@ -293,12 +293,11 @@ tusb6010_setup_interface(struct musb_hdrc_platform_data *data,
 			);
 
 	/* IRQ */
-	status = gpio_request(irq, "TUSB6010 irq");
+	status = gpio_request_one(irq, GPIOF_IN, "TUSB6010 irq");
 	if (status < 0) {
 		printk(error, 3, status);
 		return status;
 	}
-	gpio_direction_input(irq);
 	tusb_resources[2].start = irq + IH_GPIO_BASE;
 
 	/* set up memory timings ... can speed them up later */
diff --git a/arch/arm/mach-omap2/voltage.c b/arch/arm/mach-omap2/voltage.c
index 0c1552d..9ef3789 100644
--- a/arch/arm/mach-omap2/voltage.c
+++ b/arch/arm/mach-omap2/voltage.c
@@ -148,7 +148,6 @@ static int vp_volt_debug_get(void *data, u64 *val)
 	}
 
 	vsel = vdd->read_reg(prm_mod_offs, vdd->vp_data->voltage);
-	pr_notice("curr_vsel = %x\n", vsel);
 
 	if (!vdd->pmic_info->vsel_to_uv) {
 		pr_warning("PMIC function to convert vsel to voltage"
diff --git a/arch/arm/plat-omap/include/plat/gpmc-smsc911x.h b/arch/arm/plat-omap/include/plat/gpmc-smsc911x.h
index 872de0bf..ea6c9c8 100644
--- a/arch/arm/plat-omap/include/plat/gpmc-smsc911x.h
+++ b/arch/arm/plat-omap/include/plat/gpmc-smsc911x.h
@@ -14,14 +14,14 @@
 #ifndef __ASM_ARCH_OMAP_GPMC_SMSC911X_H__
 
 struct omap_smsc911x_platform_data {
+	int	id;
 	int	cs;
 	int	gpio_irq;
 	int	gpio_reset;
 	u32	flags;
 };
 
-#if defined(CONFIG_SMSC911X) || \
-	defined(CONFIG_SMSC911X_MODULE)
+#if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
 
 extern void gpmc_smsc911x_init(struct omap_smsc911x_platform_data *d);
 
diff --git a/arch/arm/plat-omap/include/plat/uncompress.h b/arch/arm/plat-omap/include/plat/uncompress.h
index 565d266..ac4b60d 100644
--- a/arch/arm/plat-omap/include/plat/uncompress.h
+++ b/arch/arm/plat-omap/include/plat/uncompress.h
@@ -129,7 +129,6 @@ static inline void __arch_decomp_setup(unsigned long arch_id)
 		DEBUG_LL_OMAP1(3, sx1);
 
 		/* omap2 based boards using UART1 */
-		DEBUG_LL_OMAP2(1, omap2evm);
 		DEBUG_LL_OMAP2(1, omap_2430sdp);
 		DEBUG_LL_OMAP2(1, omap_apollon);
 		DEBUG_LL_OMAP2(1, omap_h4);
diff --git a/arch/arm/plat-omap/include/plat/usb.h b/arch/arm/plat-omap/include/plat/usb.h
index 02b96c8..17d3c93 100644
--- a/arch/arm/plat-omap/include/plat/usb.h
+++ b/arch/arm/plat-omap/include/plat/usb.h
@@ -113,7 +113,7 @@ extern int omap4430_phy_suspend(struct device *dev, int suspend);
 extern void am35x_musb_reset(void);
 extern void am35x_musb_phy_power(u8 on);
 extern void am35x_musb_clear_irq(void);
-extern void am35x_musb_set_mode(u8 musb_mode);
+extern void am35x_set_mode(u8 musb_mode);
 
 /*
  * FIXME correct answer depends on hmc_mode,
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 423145a6..2f6a22e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
 	select HAVE_RCU_TABLE_FREE if SMP
+	select HAVE_SYSCALL_TRACEPOINTS
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/powerpc/boot/dts/canyonlands.dts b/arch/powerpc/boot/dts/canyonlands.dts
index 2779f08..22dd6ae 100644
--- a/arch/powerpc/boot/dts/canyonlands.dts
+++ b/arch/powerpc/boot/dts/canyonlands.dts
@@ -530,5 +530,23 @@
 				0x0 0x0 0x0 0x3 &UIC3 0x12 0x4 /* swizzled int C */
 				0x0 0x0 0x0 0x4 &UIC3 0x13 0x4 /* swizzled int D */>;
 		};
+
+		MSI: ppc4xx-msi@C10000000 {
+			compatible = "amcc,ppc4xx-msi", "ppc4xx-msi";
+			reg = < 0xC 0x10000000 0x100>;
+			sdr-base = <0x36C>;
+			msi-data = <0x00000000>;
+			msi-mask = <0x44440000>;
+			interrupt-count = <3>;
+			interrupts = <0 1 2 3>;
+			interrupt-parent = <&UIC3>;
+			#interrupt-cells = <1>;
+			#address-cells = <0>;
+			#size-cells = <0>;
+			interrupt-map = <0 &UIC3 0x18 1
+					1 &UIC3 0x19 1
+					2 &UIC3 0x1A 1
+					3 &UIC3 0x1B 1>;
+		};
 	};
 };
diff --git a/arch/powerpc/boot/dts/katmai.dts b/arch/powerpc/boot/dts/katmai.dts
index 7c3be5e..f913dbe 100644
--- a/arch/powerpc/boot/dts/katmai.dts
+++ b/arch/powerpc/boot/dts/katmai.dts
@@ -442,6 +442,24 @@
 				0x0 0x0 0x0 0x4 &UIC3 0xb 0x4 /* swizzled int D */>;
 		};
 
+		MSI: ppc4xx-msi@400300000 {
+				compatible = "amcc,ppc4xx-msi", "ppc4xx-msi";
+				reg = < 0x4 0x00300000 0x100>;
+				sdr-base = <0x3B0>;
+				msi-data = <0x00000000>;
+				msi-mask = <0x44440000>;
+				interrupt-count = <3>;
+				interrupts =<0 1 2 3>;
+				interrupt-parent = <&UIC0>;
+				#interrupt-cells = <1>;
+				#address-cells = <0>;
+				#size-cells = <0>;
+				interrupt-map = <0 &UIC0 0xC 1
+					1 &UIC0 0x0D 1
+					2 &UIC0 0x0E 1
+					3 &UIC0 0x0F 1>;
+		};
+
 		I2O: i2o@400100000 {
 			compatible = "ibm,i2o-440spe";
 			reg = <0x00000004 0x00100000 0x100>;
diff --git a/arch/powerpc/boot/dts/kilauea.dts b/arch/powerpc/boot/dts/kilauea.dts
index 89edb16..1613d6e 100644
--- a/arch/powerpc/boot/dts/kilauea.dts
+++ b/arch/powerpc/boot/dts/kilauea.dts
@@ -403,5 +403,33 @@
 				0x0 0x0 0x0 0x3 &UIC2 0xd 0x4 /* swizzled int C */
 				0x0 0x0 0x0 0x4 &UIC2 0xe 0x4 /* swizzled int D */>;
 		};
+
+		MSI: ppc4xx-msi@C10000000 {
+			compatible = "amcc,ppc4xx-msi", "ppc4xx-msi";
+			reg = < 0x0 0xEF620000 0x100>;
+			sdr-base = <0x4B0>;
+			msi-data = <0x00000000>;
+			msi-mask = <0x44440000>;
+			interrupt-count = <12>;
+			interrupts = <0 1 2 3 4 5 6 7 8 9 0xA 0xB 0xC 0xD>;
+			interrupt-parent = <&UIC2>;
+			#interrupt-cells = <1>;
+			#address-cells = <0>;
+			#size-cells = <0>;
+			interrupt-map = <0 &UIC2 0x10 1
+					1 &UIC2 0x11 1
+					2 &UIC2 0x12 1
+					2 &UIC2 0x13 1
+					2 &UIC2 0x14 1
+					2 &UIC2 0x15 1
+					2 &UIC2 0x16 1
+					2 &UIC2 0x17 1
+					2 &UIC2 0x18 1
+					2 &UIC2 0x19 1
+					2 &UIC2 0x1A 1
+					2 &UIC2 0x1B 1
+					2 &UIC2 0x1C 1
+					3 &UIC2 0x1D 1>;
+		};
 	};
 };
diff --git a/arch/powerpc/boot/dts/redwood.dts b/arch/powerpc/boot/dts/redwood.dts
index 81636c0..d86a3a4 100644
--- a/arch/powerpc/boot/dts/redwood.dts
+++ b/arch/powerpc/boot/dts/redwood.dts
@@ -358,8 +358,28 @@
 				0x0 0x0 0x0 0x4 &UIC3 0xb 0x4 /* swizzled int D */>;
 		};
 
+		MSI: ppc4xx-msi@400300000 {
+				compatible = "amcc,ppc4xx-msi", "ppc4xx-msi";
+				reg = < 0x4 0x00300000 0x100
+					0x4 0x00300000 0x100>;
+				sdr-base = <0x3B0>;
+				msi-data = <0x00000000>;
+				msi-mask = <0x44440000>;
+				interrupt-count = <3>;
+				interrupts =<0 1 2 3>;
+				interrupt-parent = <&UIC0>;
+				#interrupt-cells = <1>;
+				#address-cells = <0>;
+				#size-cells = <0>;
+				interrupt-map = <0 &UIC0 0xC 1
+					1 &UIC0 0x0D 1
+					2 &UIC0 0x0E 1
+					3 &UIC0 0x0F 1>;
+		};
+
 	};
 
+
 	chosen {
 		linux,stdout-path = "/plb/opb/serial@ef600200";
 	};
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index dde1296..169d039e 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -60,4 +60,18 @@ struct dyn_arch_ftrace {
 
 #endif
 
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) && !defined(__ASSEMBLY__)
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+	/*
+	 * Compare the symbol name with the system call name. Skip the .sys or .SyS
+	 * prefix from the symbol name and the sys prefix from the system call name and
+	 * just match the rest. This is only needed on ppc64 since symbol names on
+	 * 32bit do not start with a period so the generic function will work.
+	 */
+	return !strcmp(sym + 4, name + 3);
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 && !__ASSEMBLY__ */
+
 #endif /* _ASM_POWERPC_FTRACE */
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 852b8c1..fd8201d 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -236,7 +236,7 @@
 #define H_HOME_NODE_ASSOCIATIVITY 0x2EC
 #define H_BEST_ENERGY		0x2F4
 #define H_GET_MPP_X		0x314
-#define MAX_HCALL_OPCODE	H_BEST_ENERGY
+#define MAX_HCALL_OPCODE	H_GET_MPP_X
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 880b8c1..11eb404 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -191,8 +191,6 @@ extern unsigned long __secondary_hold_spinloop;
 extern unsigned long __secondary_hold_acknowledge;
 extern char __secondary_hold;
 
-extern irqreturn_t debug_ipi_action(int irq, void *data);
-
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index 23913e9..b54b2ad 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -15,6 +15,11 @@
 
 #include <linux/sched.h>
 
+/* ftrace syscalls requires exporting the sys_call_table */
+#ifdef CONFIG_FTRACE_SYSCALLS
+extern const unsigned long *sys_call_table;
+#endif /* CONFIG_FTRACE_SYSCALLS */
+
 static inline long syscall_get_nr(struct task_struct *task,
 				  struct pt_regs *regs)
 {
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 37c353e..836f231 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -110,7 +110,8 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NOERROR		12	/* Force successful syscall return */
 #define TIF_NOTIFY_RESUME	13	/* callback before returning to user */
 #define TIF_FREEZE		14	/* Freezing for suspend */
-#define TIF_RUNLATCH		15	/* Is the runlatch enabled? */
+#define TIF_SYSCALL_TRACEPOINT	15	/* syscall tracepoint instrumentation */
+#define TIF_RUNLATCH		16	/* Is the runlatch enabled? */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -127,8 +128,10 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_NOERROR		(1<<TIF_NOERROR)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
+#define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_RUNLATCH		(1<<TIF_RUNLATCH)
-#define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP)
+#define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
 
 #define _TIF_USER_WORK_MASK	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
 				 _TIF_NOTIFY_RESUME)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9aab363..e8b9818 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -109,6 +109,7 @@ obj-$(CONFIG_PPC_IO_WORKAROUNDS)	+= io-workarounds.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_callchain.o
 
 obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_event.o
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index ce1f3e44..bf99cfa 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -22,6 +22,7 @@
 #include <asm/cacheflush.h>
 #include <asm/code-patching.h>
 #include <asm/ftrace.h>
+#include <asm/syscall.h>
 
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -600,3 +601,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 	}
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64)
+unsigned long __init arch_syscall_addr(int nr)
+{
+	return sys_call_table[nr*2];
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a24d37d..5b428e3 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -295,17 +295,20 @@ static inline void handle_one_irq(unsigned int irq)
 	unsigned long saved_sp_limit;
 	struct irq_desc *desc;
 
+	desc = irq_to_desc(irq);
+	if (!desc)
+		return;
+
 	/* Switch to the irq stack to handle this */
 	curtp = current_thread_info();
 	irqtp = hardirq_ctx[smp_processor_id()];
 
 	if (curtp == irqtp) {
 		/* We're already on the irq stack, just handle it */
-		generic_handle_irq(irq);
+		desc->handle_irq(irq, desc);
 		return;
 	}
 
-	desc = irq_to_desc(irq);
 	saved_sp_limit = current->thread.ksp_limit;
 
 	irqtp->task = curtp->task;
@@ -557,15 +560,8 @@ struct irq_host *irq_alloc_host(struct device_node *of_node,
 	if (revmap_type == IRQ_HOST_MAP_LEGACY) {
 		if (irq_map[0].host != NULL) {
 			raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-			/* If we are early boot, we can't free the structure,
-			 * too bad...
-			 * this will be fixed once slab is made available early
-			 * instead of the current cruft
-			 */
-			if (mem_init_done) {
-				of_node_put(host->of_node);
-				kfree(host);
-			}
+			of_node_put(host->of_node);
+			kfree(host);
 			return NULL;
 		}
 		irq_map[0].host = host;
@@ -727,9 +723,7 @@ unsigned int irq_create_mapping(struct irq_host *host,
 	}
 	pr_debug("irq: -> using host @%p\n", host);
 
-	/* Check if mapping already exist, if it does, call
-	 * host->ops->map() to update the flags
-	 */
+	/* Check if mapping already exists */
 	virq = irq_find_mapping(host, hwirq);
 	if (virq != NO_IRQ) {
 		pr_debug("irq: -> existing mapping on virq %d\n", virq);
@@ -899,10 +893,13 @@ unsigned int irq_radix_revmap_lookup(struct irq_host *host,
 		return irq_find_mapping(host, hwirq);
 
 	/*
-	 * No rcu_read_lock(ing) needed, the ptr returned can't go under us
-	 * as it's referencing an entry in the static irq_map table.
+	 * The ptr returned references the static global irq_map.
+	 * but freeing an irq can delete nodes along the path to
+	 * do the lookup via call_rcu.
 	 */
+	rcu_read_lock();
 	ptr = radix_tree_lookup(&host->revmap_data.tree, hwirq);
+	rcu_read_unlock();
 
 	/*
 	 * If found in radix tree, then fine.
@@ -1010,14 +1007,23 @@ void irq_free_virt(unsigned int virq, unsigned int count)
 	WARN_ON (virq < NUM_ISA_INTERRUPTS);
 	WARN_ON (count == 0 || (virq + count) > irq_virq_count);
 
+	if (virq < NUM_ISA_INTERRUPTS) {
+		if (virq + count < NUM_ISA_INTERRUPTS)
+			return;
+		count  =- NUM_ISA_INTERRUPTS - virq;
+		virq = NUM_ISA_INTERRUPTS;
+	}
+
+	if (count > irq_virq_count || virq > irq_virq_count - count) {
+		if (virq > irq_virq_count)
+			return;
+		count = irq_virq_count - virq;
+	}
+
 	raw_spin_lock_irqsave(&irq_big_lock, flags);
 	for (i = virq; i < (virq + count); i++) {
 		struct irq_host *host;
 
-		if (i < NUM_ISA_INTERRUPTS ||
-		    (virq + count) > irq_virq_count)
-			continue;
-
 		host = irq_map[i].host;
 		irq_map[i].hwirq = host->inval_irq;
 		smp_wmb();
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index a6ae1cf..cb22024 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -29,6 +29,7 @@
 #include <linux/signal.h>
 #include <linux/seccomp.h>
 #include <linux/audit.h>
+#include <trace/syscall.h>
 #ifdef CONFIG_PPC32
 #include <linux/module.h>
 #endif
@@ -40,6 +41,9 @@
 #include <asm/pgtable.h>
 #include <asm/system.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
 /*
  * The parameter save area on the stack is used to store arguments being passed
  * to callee function and is located at fixed offset from stack pointer.
@@ -1710,6 +1714,9 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 		 */
 		ret = -1L;
 
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->gpr[0]);
+
 	if (unlikely(current->audit_context)) {
 #ifdef CONFIG_PPC64
 		if (!is_32bit_task())
@@ -1738,6 +1745,9 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 		audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
 				   regs->result);
 
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->result);
+
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, step);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 4a6f2ec..8ebc670 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -129,7 +129,7 @@ static irqreturn_t call_function_single_action(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-irqreturn_t debug_ipi_action(int irq, void *data)
+static irqreturn_t debug_ipi_action(int irq, void *data)
 {
 	if (crash_ipi_function_ptr) {
 		crash_ipi_function_ptr(get_irq_regs());
diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c
index 8ee51a2..e6bec74 100644
--- a/arch/powerpc/oprofile/op_model_power4.c
+++ b/arch/powerpc/oprofile/op_model_power4.c
@@ -261,6 +261,28 @@ static int get_kernel(unsigned long pc, unsigned long mmcra)
 	return is_kernel;
 }
 
+static bool pmc_overflow(unsigned long val)
+{
+	if ((int)val < 0)
+		return true;
+
+	/*
+	 * Events on POWER7 can roll back if a speculative event doesn't
+	 * eventually complete. Unfortunately in some rare cases they will
+	 * raise a performance monitor exception. We need to catch this to
+	 * ensure we reset the PMC. In all cases the PMC will be 256 or less
+	 * cycles from overflow.
+	 *
+	 * We only do this if the first pass fails to find any overflowing
+	 * PMCs because a user might set a period of less than 256 and we
+	 * don't want to mistakenly reset them.
+	 */
+	if (__is_processor(PV_POWER7) && ((0x80000000 - val) <= 256))
+		return true;
+
+	return false;
+}
+
 static void power4_handle_interrupt(struct pt_regs *regs,
 				    struct op_counter_config *ctr)
 {
@@ -281,7 +303,7 @@ static void power4_handle_interrupt(struct pt_regs *regs,
 
 	for (i = 0; i < cur_cpu_spec->num_pmcs; ++i) {
 		val = classic_ctr_read(i);
-		if (val < 0) {
+		if (pmc_overflow(val)) {
 			if (oprofile_running && ctr[i].enabled) {
 				oprofile_add_ext_sample(pc, regs, i, is_kernel);
 				classic_ctr_write(i, reset_value[i]);
diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig
index b721764..d733d7c 100644
--- a/arch/powerpc/platforms/40x/Kconfig
+++ b/arch/powerpc/platforms/40x/Kconfig
@@ -57,6 +57,8 @@ config KILAUEA
 	select 405EX
 	select PPC40x_SIMPLE
 	select PPC4xx_PCI_EXPRESS
+	select PCI_MSI
+	select PPC4xx_MSI
 	help
 	  This option enables support for the AMCC PPC405EX evaluation board.
 
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index f485fc5f..e958b6f 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -74,6 +74,8 @@ config KATMAI
 	select 440SPe
 	select PCI
 	select PPC4xx_PCI_EXPRESS
+	select PCI_MSI
+	select PCC4xx_MSI
 	help
 	  This option enables support for the AMCC PPC440SPe evaluation board.
 
@@ -118,6 +120,8 @@ config CANYONLANDS
 	select 460EX
 	select PCI
 	select PPC4xx_PCI_EXPRESS
+	select PCI_MSI
+	select PPC4xx_MSI
 	select IBM_NEW_EMAC_RGMII
 	select IBM_NEW_EMAC_ZMII
 	help
@@ -144,6 +148,8 @@ config REDWOOD
 	select 460SX
 	select PCI
 	select PPC4xx_PCI_EXPRESS
+	select PCI_MSI
+	select PPC4xx_MSI
 	help
 	  This option enables support for the AMCC PPC460SX Redwood board.
 
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 449c08c..3e4eba6 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -176,14 +176,14 @@ EXPORT_SYMBOL_GPL(iic_get_target_id);
 #ifdef CONFIG_SMP
 
 /* Use the highest interrupt priorities for IPI */
-static inline int iic_ipi_to_irq(int ipi)
+static inline int iic_msg_to_irq(int msg)
 {
-	return IIC_IRQ_TYPE_IPI + 0xf - ipi;
+	return IIC_IRQ_TYPE_IPI + 0xf - msg;
 }
 
-void iic_cause_IPI(int cpu, int mesg)
+void iic_message_pass(int cpu, int msg)
 {
-	out_be64(&per_cpu(cpu_iic, cpu).regs->generate, (0xf - mesg) << 4);
+	out_be64(&per_cpu(cpu_iic, cpu).regs->generate, (0xf - msg) << 4);
 }
 
 struct irq_host *iic_get_irq_host(int node)
@@ -192,50 +192,31 @@ struct irq_host *iic_get_irq_host(int node)
 }
 EXPORT_SYMBOL_GPL(iic_get_irq_host);
 
-static irqreturn_t iic_ipi_action(int irq, void *dev_id)
-{
-	int ipi = (int)(long)dev_id;
-
-	switch(ipi) {
-	case PPC_MSG_CALL_FUNCTION:
-		generic_smp_call_function_interrupt();
-		break;
-	case PPC_MSG_RESCHEDULE:
-		scheduler_ipi();
-		break;
-	case PPC_MSG_CALL_FUNC_SINGLE:
-		generic_smp_call_function_single_interrupt();
-		break;
-	case PPC_MSG_DEBUGGER_BREAK:
-		debug_ipi_action(0, NULL);
-		break;
-	}
-	return IRQ_HANDLED;
-}
-static void iic_request_ipi(int ipi, const char *name)
+static void iic_request_ipi(int msg)
 {
 	int virq;
 
-	virq = irq_create_mapping(iic_host, iic_ipi_to_irq(ipi));
+	virq = irq_create_mapping(iic_host, iic_msg_to_irq(msg));
 	if (virq == NO_IRQ) {
 		printk(KERN_ERR
-		       "iic: failed to map IPI %s\n", name);
+		       "iic: failed to map IPI %s\n", smp_ipi_name[msg]);
 		return;
 	}
-	if (request_irq(virq, iic_ipi_action, IRQF_DISABLED, name,
-			(void *)(long)ipi))
-		printk(KERN_ERR
-		       "iic: failed to request IPI %s\n", name);
+
+	/*
+	 * If smp_request_message_ipi encounters an error it will notify
+	 * the error.  If a message is not needed it will return non-zero.
+	 */
+	if (smp_request_message_ipi(virq, msg))
+		irq_dispose_mapping(virq);
 }
 
 void iic_request_IPIs(void)
 {
-	iic_request_ipi(PPC_MSG_CALL_FUNCTION, "IPI-call");
-	iic_request_ipi(PPC_MSG_RESCHEDULE, "IPI-resched");
-	iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE, "IPI-call-single");
-#ifdef CONFIG_DEBUGGER
-	iic_request_ipi(PPC_MSG_DEBUGGER_BREAK, "IPI-debug");
-#endif /* CONFIG_DEBUGGER */
+	iic_request_ipi(PPC_MSG_CALL_FUNCTION);
+	iic_request_ipi(PPC_MSG_RESCHEDULE);
+	iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
+	iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
 }
 
 #endif /* CONFIG_SMP */
diff --git a/arch/powerpc/platforms/cell/interrupt.h b/arch/powerpc/platforms/cell/interrupt.h
index 942dc39..4f60ae6 100644
--- a/arch/powerpc/platforms/cell/interrupt.h
+++ b/arch/powerpc/platforms/cell/interrupt.h
@@ -75,7 +75,7 @@ enum {
 };
 
 extern void iic_init_IRQ(void);
-extern void iic_cause_IPI(int cpu, int mesg);
+extern void iic_message_pass(int cpu, int msg);
 extern void iic_request_IPIs(void);
 extern void iic_setup_cpu(void);
 
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index d176e61..dbb641e 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -152,7 +152,7 @@ static int smp_cell_cpu_bootable(unsigned int nr)
 	return 1;
 }
 static struct smp_ops_t bpa_iic_smp_ops = {
-	.message_pass	= iic_cause_IPI,
+	.message_pass	= iic_message_pass,
 	.probe		= smp_iic_probe,
 	.kick_cpu	= smp_cell_kick_cpu,
 	.setup_cpu	= smp_cell_setup_cpu,
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index d775fd1..7b4df37 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -7,11 +7,18 @@ config PPC4xx_PCI_EXPRESS
 	depends on PCI && 4xx
 	default n
 
+config PPC4xx_MSI
+	bool
+	depends on PCI_MSI
+	depends on PCI && 4xx
+	default n
+
 config PPC_MSI_BITMAP
 	bool
 	depends on PCI_MSI
 	default y if MPIC
 	default y if FSL_PCI
+	default y if PPC4xx_MSI
 
 source "arch/powerpc/sysdev/xics/Kconfig"
 
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 6076e00..0efa990 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_OF_RTC)		+= of_rtc.o
 ifeq ($(CONFIG_PCI),y)
 obj-$(CONFIG_4xx)		+= ppc4xx_pci.o
 endif
+obj-$(CONFIG_PPC4xx_MSI)	+= ppc4xx_msi.o
 obj-$(CONFIG_PPC4xx_CPM)	+= ppc4xx_cpm.o
 obj-$(CONFIG_PPC4xx_GPIO)	+= ppc4xx_gpio.o
 
diff --git a/arch/powerpc/sysdev/ppc4xx_msi.c b/arch/powerpc/sysdev/ppc4xx_msi.c
new file mode 100644
index 0000000..367af02
--- /dev/null
+++ b/arch/powerpc/sysdev/ppc4xx_msi.c
@@ -0,0 +1,276 @@
+/*
+ * Adding PCI-E MSI support for PPC4XX SoCs.
+ *
+ * Copyright (c) 2010, Applied Micro Circuits Corporation
+ * Authors:	Tirumala R Marri <tmarri@apm.com>
+ *		Feng Kan <fkan@apm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ */
+
+#include <linux/irq.h>
+#include <linux/bootmem.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/of_platform.h>
+#include <linux/interrupt.h>
+#include <asm/prom.h>
+#include <asm/hw_irq.h>
+#include <asm/ppc-pci.h>
+#include <boot/dcr.h>
+#include <asm/dcr-regs.h>
+#include <asm/msi_bitmap.h>
+
+#define PEIH_TERMADH	0x00
+#define PEIH_TERMADL	0x08
+#define PEIH_MSIED	0x10
+#define PEIH_MSIMK	0x18
+#define PEIH_MSIASS	0x20
+#define PEIH_FLUSH0	0x30
+#define PEIH_FLUSH1	0x38
+#define PEIH_CNTRST	0x48
+#define NR_MSI_IRQS	4
+
+struct ppc4xx_msi {
+	u32 msi_addr_lo;
+	u32 msi_addr_hi;
+	void __iomem *msi_regs;
+	int msi_virqs[NR_MSI_IRQS];
+	struct msi_bitmap bitmap;
+	struct device_node *msi_dev;
+};
+
+static struct ppc4xx_msi ppc4xx_msi;
+
+static int ppc4xx_msi_init_allocator(struct platform_device *dev,
+		struct ppc4xx_msi *msi_data)
+{
+	int err;
+
+	err = msi_bitmap_alloc(&msi_data->bitmap, NR_MSI_IRQS,
+			      dev->dev.of_node);
+	if (err)
+		return err;
+
+	err = msi_bitmap_reserve_dt_hwirqs(&msi_data->bitmap);
+	if (err < 0) {
+		msi_bitmap_free(&msi_data->bitmap);
+		return err;
+	}
+
+	return 0;
+}
+
+static int ppc4xx_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	int int_no = -ENOMEM;
+	unsigned int virq;
+	struct msi_msg msg;
+	struct msi_desc *entry;
+	struct ppc4xx_msi *msi_data = &ppc4xx_msi;
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		int_no = msi_bitmap_alloc_hwirqs(&msi_data->bitmap, 1);
+		if (int_no >= 0)
+			break;
+		if (int_no < 0) {
+			pr_debug("%s: fail allocating msi interrupt\n",
+					__func__);
+		}
+		virq = irq_of_parse_and_map(msi_data->msi_dev, int_no);
+		if (virq == NO_IRQ) {
+			dev_err(&dev->dev, "%s: fail mapping irq\n", __func__);
+			msi_bitmap_free_hwirqs(&msi_data->bitmap, int_no, 1);
+			return -ENOSPC;
+		}
+		dev_dbg(&dev->dev, "%s: virq = %d\n", __func__, virq);
+
+		/* Setup msi address space */
+		msg.address_hi = msi_data->msi_addr_hi;
+		msg.address_lo = msi_data->msi_addr_lo;
+
+		irq_set_msi_desc(virq, entry);
+		msg.data = int_no;
+		write_msi_msg(virq, &msg);
+	}
+	return 0;
+}
+
+void ppc4xx_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+	struct ppc4xx_msi *msi_data = &ppc4xx_msi;
+
+	dev_dbg(&dev->dev, "PCIE-MSI: tearing down msi irqs\n");
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		if (entry->irq == NO_IRQ)
+			continue;
+		irq_set_msi_desc(entry->irq, NULL);
+		msi_bitmap_free_hwirqs(&msi_data->bitmap,
+				virq_to_hw(entry->irq), 1);
+		irq_dispose_mapping(entry->irq);
+	}
+}
+
+static int ppc4xx_msi_check_device(struct pci_dev *pdev, int nvec, int type)
+{
+	dev_dbg(&pdev->dev, "PCIE-MSI:%s called. vec %x type %d\n",
+		__func__, nvec, type);
+	if (type == PCI_CAP_ID_MSIX)
+		pr_debug("ppc4xx msi: MSI-X untested, trying anyway.\n");
+
+	return 0;
+}
+
+static int ppc4xx_setup_pcieh_hw(struct platform_device *dev,
+				 struct resource res, struct ppc4xx_msi *msi)
+{
+	const u32 *msi_data;
+	const u32 *msi_mask;
+	const u32 *sdr_addr;
+	dma_addr_t msi_phys;
+	void *msi_virt;
+
+	sdr_addr = of_get_property(dev->dev.of_node, "sdr-base", NULL);
+	if (!sdr_addr)
+		return -1;
+
+	SDR0_WRITE(sdr_addr, (u64)res.start >> 32);	 /*HIGH addr */
+	SDR0_WRITE(sdr_addr + 1, res.start & 0xFFFFFFFF); /* Low addr */
+
+
+	msi->msi_dev = of_find_node_by_name(NULL, "ppc4xx-msi");
+	if (msi->msi_dev)
+		return -ENODEV;
+
+	msi->msi_regs = of_iomap(msi->msi_dev, 0);
+	if (!msi->msi_regs) {
+		dev_err(&dev->dev, "of_iomap problem failed\n");
+		return -ENOMEM;
+	}
+	dev_dbg(&dev->dev, "PCIE-MSI: msi register mapped 0x%x 0x%x\n",
+		(u32) (msi->msi_regs + PEIH_TERMADH), (u32) (msi->msi_regs));
+
+	msi_virt = dma_alloc_coherent(&dev->dev, 64, &msi_phys, GFP_KERNEL);
+	msi->msi_addr_hi = 0x0;
+	msi->msi_addr_lo = (u32) msi_phys;
+	dev_dbg(&dev->dev, "PCIE-MSI: msi address 0x%x\n", msi->msi_addr_lo);
+
+	/* Progam the Interrupt handler Termination addr registers */
+	out_be32(msi->msi_regs + PEIH_TERMADH, msi->msi_addr_hi);
+	out_be32(msi->msi_regs + PEIH_TERMADL, msi->msi_addr_lo);
+
+	msi_data = of_get_property(dev->dev.of_node, "msi-data", NULL);
+	if (!msi_data)
+		return -1;
+	msi_mask = of_get_property(dev->dev.of_node, "msi-mask", NULL);
+	if (!msi_mask)
+		return -1;
+	/* Program MSI Expected data and Mask bits */
+	out_be32(msi->msi_regs + PEIH_MSIED, *msi_data);
+	out_be32(msi->msi_regs + PEIH_MSIMK, *msi_mask);
+
+	return 0;
+}
+
+static int ppc4xx_of_msi_remove(struct platform_device *dev)
+{
+	struct ppc4xx_msi *msi = dev->dev.platform_data;
+	int i;
+	int virq;
+
+	for (i = 0; i < NR_MSI_IRQS; i++) {
+		virq = msi->msi_virqs[i];
+		if (virq != NO_IRQ)
+			irq_dispose_mapping(virq);
+	}
+
+	if (msi->bitmap.bitmap)
+		msi_bitmap_free(&msi->bitmap);
+	iounmap(msi->msi_regs);
+	of_node_put(msi->msi_dev);
+	kfree(msi);
+
+	return 0;
+}
+
+static int __devinit ppc4xx_msi_probe(struct platform_device *dev)
+{
+	struct ppc4xx_msi *msi;
+	struct resource res;
+	int err = 0;
+
+	msi = &ppc4xx_msi;/*keep the msi data for further use*/
+
+	dev_dbg(&dev->dev, "PCIE-MSI: Setting up MSI support...\n");
+
+	msi = kzalloc(sizeof(struct ppc4xx_msi), GFP_KERNEL);
+	if (!msi) {
+		dev_err(&dev->dev, "No memory for MSI structure\n");
+		return -ENOMEM;
+	}
+	dev->dev.platform_data = msi;
+
+	/* Get MSI ranges */
+	err = of_address_to_resource(dev->dev.of_node, 0, &res);
+	if (err) {
+		dev_err(&dev->dev, "%s resource error!\n",
+			dev->dev.of_node->full_name);
+		goto error_out;
+	}
+
+	if (ppc4xx_setup_pcieh_hw(dev, res, msi))
+		goto error_out;
+
+	err = ppc4xx_msi_init_allocator(dev, msi);
+	if (err) {
+		dev_err(&dev->dev, "Error allocating MSI bitmap\n");
+		goto error_out;
+	}
+
+	ppc_md.setup_msi_irqs = ppc4xx_setup_msi_irqs;
+	ppc_md.teardown_msi_irqs = ppc4xx_teardown_msi_irqs;
+	ppc_md.msi_check_device = ppc4xx_msi_check_device;
+	return err;
+
+error_out:
+	ppc4xx_of_msi_remove(dev);
+	return err;
+}
+static const struct of_device_id ppc4xx_msi_ids[] = {
+	{
+		.compatible = "amcc,ppc4xx-msi",
+	},
+	{}
+};
+static struct platform_driver ppc4xx_msi_driver = {
+	.probe = ppc4xx_msi_probe,
+	.remove = ppc4xx_of_msi_remove,
+	.driver = {
+		   .name = "ppc4xx-msi",
+		   .owner = THIS_MODULE,
+		   .of_match_table = ppc4xx_msi_ids,
+		   },
+
+};
+
+static __init int ppc4xx_msi_init(void)
+{
+	return platform_driver_register(&ppc4xx_msi_driver);
+}
+
+subsys_initcall(ppc4xx_msi_init);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe..d240ea9 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
        return _hypercall2(unsigned long, hvm_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 6e35ecc..0f9a84c 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -2,6 +2,7 @@ menuconfig INFINIBAND
 	tristate "InfiniBand support"
 	depends on PCI || BROKEN
 	depends on HAS_IOMEM
+	depends on NET
 	---help---
 	  Core support for InfiniBand (IB).  Make sure to also select
 	  any protocols you wish to use as well as drivers for your
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index cb1ab3e..c8bbaef 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 					$(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
-				device.o fmr_pool.o cache.o
+				device.o fmr_pool.o cache.o netlink.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 
 ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index f804e28..f62f52f 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3639,8 +3639,16 @@ static struct kobj_type cm_port_obj_type = {
 	.release = cm_release_port_obj
 };
 
+static char *cm_devnode(struct device *dev, mode_t *mode)
+{
+	*mode = 0666;
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
 struct class cm_class = {
+	.owner   = THIS_MODULE,
 	.name    = "infiniband_cm",
+	.devnode = cm_devnode,
 };
 EXPORT_SYMBOL(cm_class);
 
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 99dde87..b6a33b3 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -47,6 +47,7 @@
 
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_cm.h>
 #include <rdma/ib_sa.h>
@@ -89,20 +90,6 @@ struct cma_device {
 	struct list_head	id_list;
 };
 
-enum cma_state {
-	CMA_IDLE,
-	CMA_ADDR_QUERY,
-	CMA_ADDR_RESOLVED,
-	CMA_ROUTE_QUERY,
-	CMA_ROUTE_RESOLVED,
-	CMA_CONNECT,
-	CMA_DISCONNECT,
-	CMA_ADDR_BOUND,
-	CMA_LISTEN,
-	CMA_DEVICE_REMOVAL,
-	CMA_DESTROYING
-};
-
 struct rdma_bind_list {
 	struct idr		*ps;
 	struct hlist_head	owners;
@@ -126,7 +113,7 @@ struct rdma_id_private {
 	struct list_head	mc_list;
 
 	int			internal_id;
-	enum cma_state		state;
+	enum rdma_cm_state	state;
 	spinlock_t		lock;
 	struct mutex		qp_mutex;
 
@@ -146,6 +133,7 @@ struct rdma_id_private {
 	u32			seq_num;
 	u32			qkey;
 	u32			qp_num;
+	pid_t			owner;
 	u8			srq;
 	u8			tos;
 	u8			reuseaddr;
@@ -165,8 +153,8 @@ struct cma_multicast {
 struct cma_work {
 	struct work_struct	work;
 	struct rdma_id_private	*id;
-	enum cma_state		old_state;
-	enum cma_state		new_state;
+	enum rdma_cm_state	old_state;
+	enum rdma_cm_state	new_state;
 	struct rdma_cm_event	event;
 };
 
@@ -217,7 +205,7 @@ struct sdp_hah {
 #define CMA_VERSION 0x00
 #define SDP_MAJ_VERSION 0x2
 
-static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
 {
 	unsigned long flags;
 	int ret;
@@ -229,7 +217,7 @@ static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
 }
 
 static int cma_comp_exch(struct rdma_id_private *id_priv,
-			 enum cma_state comp, enum cma_state exch)
+			 enum rdma_cm_state comp, enum rdma_cm_state exch)
 {
 	unsigned long flags;
 	int ret;
@@ -241,11 +229,11 @@ static int cma_comp_exch(struct rdma_id_private *id_priv,
 	return ret;
 }
 
-static enum cma_state cma_exch(struct rdma_id_private *id_priv,
-			       enum cma_state exch)
+static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
+				   enum rdma_cm_state exch)
 {
 	unsigned long flags;
-	enum cma_state old;
+	enum rdma_cm_state old;
 
 	spin_lock_irqsave(&id_priv->lock, flags);
 	old = id_priv->state;
@@ -279,11 +267,6 @@ static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
 	hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
 }
 
-static inline int cma_is_ud_ps(enum rdma_port_space ps)
-{
-	return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
-}
-
 static void cma_attach_to_dev(struct rdma_id_private *id_priv,
 			      struct cma_device *cma_dev)
 {
@@ -413,7 +396,7 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
 }
 
 static int cma_disable_callback(struct rdma_id_private *id_priv,
-			      enum cma_state state)
+				enum rdma_cm_state state)
 {
 	mutex_lock(&id_priv->handler_mutex);
 	if (id_priv->state != state) {
@@ -429,7 +412,8 @@ static int cma_has_cm_dev(struct rdma_id_private *id_priv)
 }
 
 struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
-				  void *context, enum rdma_port_space ps)
+				  void *context, enum rdma_port_space ps,
+				  enum ib_qp_type qp_type)
 {
 	struct rdma_id_private *id_priv;
 
@@ -437,10 +421,12 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
 	if (!id_priv)
 		return ERR_PTR(-ENOMEM);
 
-	id_priv->state = CMA_IDLE;
+	id_priv->owner = task_pid_nr(current);
+	id_priv->state = RDMA_CM_IDLE;
 	id_priv->id.context = context;
 	id_priv->id.event_handler = event_handler;
 	id_priv->id.ps = ps;
+	id_priv->id.qp_type = qp_type;
 	spin_lock_init(&id_priv->lock);
 	mutex_init(&id_priv->qp_mutex);
 	init_completion(&id_priv->comp);
@@ -508,7 +494,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
 	if (IS_ERR(qp))
 		return PTR_ERR(qp);
 
-	if (cma_is_ud_ps(id_priv->id.ps))
+	if (id->qp_type == IB_QPT_UD)
 		ret = cma_init_ud_qp(id_priv, qp);
 	else
 		ret = cma_init_conn_qp(id_priv, qp);
@@ -636,7 +622,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
 	qp_attr->port_num = id_priv->id.port_num;
 	*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
 
-	if (cma_is_ud_ps(id_priv->id.ps)) {
+	if (id_priv->id.qp_type == IB_QPT_UD) {
 		ret = cma_set_qkey(id_priv);
 		if (ret)
 			return ret;
@@ -659,7 +645,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
 	id_priv = container_of(id, struct rdma_id_private, id);
 	switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
 	case RDMA_TRANSPORT_IB:
-		if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
+		if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
 			ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
 		else
 			ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
@@ -858,16 +844,16 @@ static void cma_cancel_listens(struct rdma_id_private *id_priv)
 }
 
 static void cma_cancel_operation(struct rdma_id_private *id_priv,
-				 enum cma_state state)
+				 enum rdma_cm_state state)
 {
 	switch (state) {
-	case CMA_ADDR_QUERY:
+	case RDMA_CM_ADDR_QUERY:
 		rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
 		break;
-	case CMA_ROUTE_QUERY:
+	case RDMA_CM_ROUTE_QUERY:
 		cma_cancel_route(id_priv);
 		break;
-	case CMA_LISTEN:
+	case RDMA_CM_LISTEN:
 		if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)
 				&& !id_priv->cma_dev)
 			cma_cancel_listens(id_priv);
@@ -918,10 +904,10 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
 void rdma_destroy_id(struct rdma_cm_id *id)
 {
 	struct rdma_id_private *id_priv;
-	enum cma_state state;
+	enum rdma_cm_state state;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	state = cma_exch(id_priv, CMA_DESTROYING);
+	state = cma_exch(id_priv, RDMA_CM_DESTROYING);
 	cma_cancel_operation(id_priv, state);
 
 	/*
@@ -1015,9 +1001,9 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	int ret = 0;
 
 	if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
-		cma_disable_callback(id_priv, CMA_CONNECT)) ||
+		cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
 	    (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
-		cma_disable_callback(id_priv, CMA_DISCONNECT)))
+		cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -1048,7 +1034,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		event.status = -ETIMEDOUT; /* fall through */
 	case IB_CM_DREQ_RECEIVED:
 	case IB_CM_DREP_RECEIVED:
-		if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+		if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+				   RDMA_CM_DISCONNECT))
 			goto out;
 		event.event = RDMA_CM_EVENT_DISCONNECTED;
 		break;
@@ -1075,7 +1062,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	if (ret) {
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
@@ -1101,7 +1088,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 		goto err;
 
 	id = rdma_create_id(listen_id->event_handler, listen_id->context,
-			    listen_id->ps);
+			    listen_id->ps, ib_event->param.req_rcvd.qp_type);
 	if (IS_ERR(id))
 		goto err;
 
@@ -1132,7 +1119,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	id_priv->state = CMA_CONNECT;
+	id_priv->state = RDMA_CM_CONNECT;
 	return id_priv;
 
 destroy_id:
@@ -1152,7 +1139,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 	int ret;
 
 	id = rdma_create_id(listen_id->event_handler, listen_id->context,
-			    listen_id->ps);
+			    listen_id->ps, IB_QPT_UD);
 	if (IS_ERR(id))
 		return NULL;
 
@@ -1172,7 +1159,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 	}
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	id_priv->state = CMA_CONNECT;
+	id_priv->state = RDMA_CM_CONNECT;
 	return id_priv;
 err:
 	rdma_destroy_id(id);
@@ -1201,13 +1188,13 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	int offset, ret;
 
 	listen_id = cm_id->context;
-	if (cma_disable_callback(listen_id, CMA_LISTEN))
+	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
 		return -ECONNABORTED;
 
 	memset(&event, 0, sizeof event);
 	offset = cma_user_data_offset(listen_id->id.ps);
 	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
-	if (cma_is_ud_ps(listen_id->id.ps)) {
+	if (listen_id->id.qp_type == IB_QPT_UD) {
 		conn_id = cma_new_udp_id(&listen_id->id, ib_event);
 		event.param.ud.private_data = ib_event->private_data + offset;
 		event.param.ud.private_data_len =
@@ -1243,8 +1230,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		 * while we're accessing the cm_id.
 		 */
 		mutex_lock(&lock);
-		if (cma_comp(conn_id, CMA_CONNECT) &&
-		    !cma_is_ud_ps(conn_id->id.ps))
+		if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD))
 			ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
 		mutex_unlock(&lock);
 		mutex_unlock(&conn_id->handler_mutex);
@@ -1257,7 +1243,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	conn_id->cm_id.ib = NULL;
 
 release_conn_id:
-	cma_exch(conn_id, CMA_DESTROYING);
+	cma_exch(conn_id, RDMA_CM_DESTROYING);
 	mutex_unlock(&conn_id->handler_mutex);
 	rdma_destroy_id(&conn_id->id);
 
@@ -1328,7 +1314,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 	struct sockaddr_in *sin;
 	int ret = 0;
 
-	if (cma_disable_callback(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -1371,7 +1357,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 	if (ret) {
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.iw = NULL;
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
@@ -1393,20 +1379,20 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	struct ib_device_attr attr;
 
 	listen_id = cm_id->context;
-	if (cma_disable_callback(listen_id, CMA_LISTEN))
+	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
 		return -ECONNABORTED;
 
 	/* Create a new RDMA id for the new IW CM ID */
 	new_cm_id = rdma_create_id(listen_id->id.event_handler,
 				   listen_id->id.context,
-				   RDMA_PS_TCP);
+				   RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(new_cm_id)) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	conn_id = container_of(new_cm_id, struct rdma_id_private, id);
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
-	conn_id->state = CMA_CONNECT;
+	conn_id->state = RDMA_CM_CONNECT;
 
 	dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
 	if (!dev) {
@@ -1461,7 +1447,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	if (ret) {
 		/* User wants to destroy the CM ID */
 		conn_id->cm_id.iw = NULL;
-		cma_exch(conn_id, CMA_DESTROYING);
+		cma_exch(conn_id, RDMA_CM_DESTROYING);
 		mutex_unlock(&conn_id->handler_mutex);
 		cma_deref_id(conn_id);
 		rdma_destroy_id(&conn_id->id);
@@ -1548,13 +1534,14 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
 	struct rdma_cm_id *id;
 	int ret;
 
-	id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+	id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+			    id_priv->id.qp_type);
 	if (IS_ERR(id))
 		return;
 
 	dev_id_priv = container_of(id, struct rdma_id_private, id);
 
-	dev_id_priv->state = CMA_ADDR_BOUND;
+	dev_id_priv->state = RDMA_CM_ADDR_BOUND;
 	memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
 	       ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
 
@@ -1601,8 +1588,8 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
 		route->num_paths = 1;
 		*route->path_rec = *path_rec;
 	} else {
-		work->old_state = CMA_ROUTE_QUERY;
-		work->new_state = CMA_ADDR_RESOLVED;
+		work->old_state = RDMA_CM_ROUTE_QUERY;
+		work->new_state = RDMA_CM_ADDR_RESOLVED;
 		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
 		work->event.status = status;
 	}
@@ -1660,7 +1647,7 @@ static void cma_work_handler(struct work_struct *_work)
 		goto out;
 
 	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		destroy = 1;
 	}
 out:
@@ -1678,12 +1665,12 @@ static void cma_ndev_work_handler(struct work_struct *_work)
 	int destroy = 0;
 
 	mutex_lock(&id_priv->handler_mutex);
-	if (id_priv->state == CMA_DESTROYING ||
-	    id_priv->state == CMA_DEVICE_REMOVAL)
+	if (id_priv->state == RDMA_CM_DESTROYING ||
+	    id_priv->state == RDMA_CM_DEVICE_REMOVAL)
 		goto out;
 
 	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		destroy = 1;
 	}
 
@@ -1707,8 +1694,8 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
 
 	work->id = id_priv;
 	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = CMA_ROUTE_QUERY;
-	work->new_state = CMA_ROUTE_RESOLVED;
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
 	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
 
 	route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
@@ -1737,7 +1724,8 @@ int rdma_set_ib_paths(struct rdma_cm_id *id,
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+			   RDMA_CM_ROUTE_RESOLVED))
 		return -EINVAL;
 
 	id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
@@ -1750,7 +1738,7 @@ int rdma_set_ib_paths(struct rdma_cm_id *id,
 	id->route.num_paths = num_paths;
 	return 0;
 err:
-	cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+	cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_set_ib_paths);
@@ -1765,8 +1753,8 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
 
 	work->id = id_priv;
 	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = CMA_ROUTE_QUERY;
-	work->new_state = CMA_ROUTE_RESOLVED;
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
 	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
 	queue_work(cma_wq, &work->work);
 	return 0;
@@ -1830,8 +1818,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 		goto err2;
 	}
 
-	work->old_state = CMA_ROUTE_QUERY;
-	work->new_state = CMA_ROUTE_RESOLVED;
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
 	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
 	work->event.status = 0;
 
@@ -1853,7 +1841,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
 		return -EINVAL;
 
 	atomic_inc(&id_priv->refcount);
@@ -1882,7 +1870,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
 
 	return 0;
 err:
-	cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
+	cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
 	cma_deref_id(id_priv);
 	return ret;
 }
@@ -1941,14 +1929,16 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 
 	memset(&event, 0, sizeof event);
 	mutex_lock(&id_priv->handler_mutex);
-	if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+			   RDMA_CM_ADDR_RESOLVED))
 		goto out;
 
 	if (!status && !id_priv->cma_dev)
 		status = cma_acquire_dev(id_priv);
 
 	if (status) {
-		if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
+		if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+				   RDMA_CM_ADDR_BOUND))
 			goto out;
 		event.event = RDMA_CM_EVENT_ADDR_ERROR;
 		event.status = status;
@@ -1959,7 +1949,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 	}
 
 	if (id_priv->id.event_handler(&id_priv->id, &event)) {
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
 		cma_deref_id(id_priv);
 		rdma_destroy_id(&id_priv->id);
@@ -2004,8 +1994,8 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv)
 
 	work->id = id_priv;
 	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = CMA_ADDR_QUERY;
-	work->new_state = CMA_ADDR_RESOLVED;
+	work->old_state = RDMA_CM_ADDR_QUERY;
+	work->new_state = RDMA_CM_ADDR_RESOLVED;
 	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
 	queue_work(cma_wq, &work->work);
 	return 0;
@@ -2034,13 +2024,13 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (id_priv->state == CMA_IDLE) {
+	if (id_priv->state == RDMA_CM_IDLE) {
 		ret = cma_bind_addr(id, src_addr, dst_addr);
 		if (ret)
 			return ret;
 	}
 
-	if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
 		return -EINVAL;
 
 	atomic_inc(&id_priv->refcount);
@@ -2056,7 +2046,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 
 	return 0;
 err:
-	cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
+	cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
 	cma_deref_id(id_priv);
 	return ret;
 }
@@ -2070,7 +2060,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
 
 	id_priv = container_of(id, struct rdma_id_private, id);
 	spin_lock_irqsave(&id_priv->lock, flags);
-	if (id_priv->state == CMA_IDLE) {
+	if (id_priv->state == RDMA_CM_IDLE) {
 		id_priv->reuseaddr = reuse;
 		ret = 0;
 	} else {
@@ -2177,7 +2167,7 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
 		if (id_priv == cur_id)
 			continue;
 
-		if ((cur_id->state == CMA_LISTEN) ||
+		if ((cur_id->state == RDMA_CM_LISTEN) ||
 		    !reuseaddr || !cur_id->reuseaddr) {
 			cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
 			if (cma_any_addr(cur_addr))
@@ -2280,14 +2270,14 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (id_priv->state == CMA_IDLE) {
+	if (id_priv->state == RDMA_CM_IDLE) {
 		((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
 		ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
 		if (ret)
 			return ret;
 	}
 
-	if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))
 		return -EINVAL;
 
 	if (id_priv->reuseaddr) {
@@ -2319,7 +2309,7 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
 	return 0;
 err:
 	id_priv->backlog = 0;
-	cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
+	cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_listen);
@@ -2333,7 +2323,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
 		return -EAFNOSUPPORT;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
+	if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
 		return -EINVAL;
 
 	ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
@@ -2360,7 +2350,7 @@ err2:
 	if (id_priv->cma_dev)
 		cma_release_dev(id_priv);
 err1:
-	cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
+	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_bind_addr);
@@ -2433,7 +2423,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
 	int ret = 0;
 
-	if (cma_disable_callback(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -2479,7 +2469,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 	if (ret) {
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
@@ -2645,7 +2635,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
 		return -EINVAL;
 
 	if (!id->qp) {
@@ -2655,7 +2645,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
 
 	switch (rdma_node_get_transport(id->device->node_type)) {
 	case RDMA_TRANSPORT_IB:
-		if (cma_is_ud_ps(id->ps))
+		if (id->qp_type == IB_QPT_UD)
 			ret = cma_resolve_ib_udp(id_priv, conn_param);
 		else
 			ret = cma_connect_ib(id_priv, conn_param);
@@ -2672,7 +2662,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
 
 	return 0;
 err:
-	cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
+	cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_connect);
@@ -2758,7 +2748,10 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (!cma_comp(id_priv, CMA_CONNECT))
+
+	id_priv->owner = task_pid_nr(current);
+
+	if (!cma_comp(id_priv, RDMA_CM_CONNECT))
 		return -EINVAL;
 
 	if (!id->qp && conn_param) {
@@ -2768,7 +2761,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
 
 	switch (rdma_node_get_transport(id->device->node_type)) {
 	case RDMA_TRANSPORT_IB:
-		if (cma_is_ud_ps(id->ps))
+		if (id->qp_type == IB_QPT_UD)
 			ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
 						conn_param->private_data,
 						conn_param->private_data_len);
@@ -2829,7 +2822,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
 
 	switch (rdma_node_get_transport(id->device->node_type)) {
 	case RDMA_TRANSPORT_IB:
-		if (cma_is_ud_ps(id->ps))
+		if (id->qp_type == IB_QPT_UD)
 			ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
 						private_data, private_data_len);
 		else
@@ -2887,8 +2880,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 	int ret;
 
 	id_priv = mc->id_priv;
-	if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
-	    cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
+	if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
+	    cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
 		return 0;
 
 	mutex_lock(&id_priv->qp_mutex);
@@ -2912,7 +2905,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 
 	ret = id_priv->id.event_handler(&id_priv->id, &event);
 	if (ret) {
-		cma_exch(id_priv, CMA_DESTROYING);
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return 0;
@@ -3095,8 +3088,8 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
-	    !cma_comp(id_priv, CMA_ADDR_RESOLVED))
+	if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) &&
+	    !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED))
 		return -EINVAL;
 
 	mc = kmalloc(sizeof *mc, GFP_KERNEL);
@@ -3261,19 +3254,19 @@ static void cma_add_one(struct ib_device *device)
 static int cma_remove_id_dev(struct rdma_id_private *id_priv)
 {
 	struct rdma_cm_event event;
-	enum cma_state state;
+	enum rdma_cm_state state;
 	int ret = 0;
 
 	/* Record that we want to remove the device */
-	state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
-	if (state == CMA_DESTROYING)
+	state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
+	if (state == RDMA_CM_DESTROYING)
 		return 0;
 
 	cma_cancel_operation(id_priv, state);
 	mutex_lock(&id_priv->handler_mutex);
 
 	/* Check for destruction from another callback. */
-	if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
+	if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
 		goto out;
 
 	memset(&event, 0, sizeof event);
@@ -3328,6 +3321,100 @@ static void cma_remove_one(struct ib_device *device)
 	kfree(cma_dev);
 }
 
+static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh;
+	struct rdma_cm_id_stats *id_stats;
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id = NULL;
+	struct cma_device *cma_dev;
+	int i_dev = 0, i_id = 0;
+
+	/*
+	 * We export all of the IDs as a sequence of messages.  Each
+	 * ID gets its own netlink message.
+	 */
+	mutex_lock(&lock);
+
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		if (i_dev < cb->args[0]) {
+			i_dev++;
+			continue;
+		}
+
+		i_id = 0;
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			if (i_id < cb->args[1]) {
+				i_id++;
+				continue;
+			}
+
+			id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
+						sizeof *id_stats, RDMA_NL_RDMA_CM,
+						RDMA_NL_RDMA_CM_ID_STATS);
+			if (!id_stats)
+				goto out;
+
+			memset(id_stats, 0, sizeof *id_stats);
+			id = &id_priv->id;
+			id_stats->node_type = id->route.addr.dev_addr.dev_type;
+			id_stats->port_num = id->port_num;
+			id_stats->bound_dev_if =
+				id->route.addr.dev_addr.bound_dev_if;
+
+			if (id->route.addr.src_addr.ss_family == AF_INET) {
+				if (ibnl_put_attr(skb, nlh,
+						  sizeof(struct sockaddr_in),
+						  &id->route.addr.src_addr,
+						  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) {
+					goto out;
+				}
+				if (ibnl_put_attr(skb, nlh,
+						  sizeof(struct sockaddr_in),
+						  &id->route.addr.dst_addr,
+						  RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) {
+					goto out;
+				}
+			} else if (id->route.addr.src_addr.ss_family == AF_INET6) {
+				if (ibnl_put_attr(skb, nlh,
+						  sizeof(struct sockaddr_in6),
+						  &id->route.addr.src_addr,
+						  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) {
+					goto out;
+				}
+				if (ibnl_put_attr(skb, nlh,
+						  sizeof(struct sockaddr_in6),
+						  &id->route.addr.dst_addr,
+						  RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) {
+					goto out;
+				}
+			}
+
+			id_stats->pid		= id_priv->owner;
+			id_stats->port_space	= id->ps;
+			id_stats->cm_state	= id_priv->state;
+			id_stats->qp_num	= id_priv->qp_num;
+			id_stats->qp_type	= id->qp_type;
+
+			i_id++;
+		}
+
+		cb->args[1] = 0;
+		i_dev++;
+	}
+
+out:
+	mutex_unlock(&lock);
+	cb->args[0] = i_dev;
+	cb->args[1] = i_id;
+
+	return skb->len;
+}
+
+static const struct ibnl_client_cbs cma_cb_table[] = {
+	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats },
+};
+
 static int __init cma_init(void)
 {
 	int ret;
@@ -3343,6 +3430,10 @@ static int __init cma_init(void)
 	ret = ib_register_client(&cma_client);
 	if (ret)
 		goto err;
+
+	if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
+		printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+
 	return 0;
 
 err:
@@ -3355,6 +3446,7 @@ err:
 
 static void __exit cma_cleanup(void)
 {
+	ibnl_remove_client(RDMA_NL_RDMA_CM);
 	ib_unregister_client(&cma_client);
 	unregister_netdevice_notifier(&cma_nb);
 	rdma_addr_unregister_client(&addr_client);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f793bf2..4007f72 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
 
@@ -725,22 +726,40 @@ static int __init ib_core_init(void)
 		return -ENOMEM;
 
 	ret = ib_sysfs_setup();
-	if (ret)
+	if (ret) {
 		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+		goto err;
+	}
+
+	ret = ibnl_init();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't init IB netlink interface\n");
+		goto err_sysfs;
+	}
 
 	ret = ib_cache_setup();
 	if (ret) {
 		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
-		ib_sysfs_cleanup();
-		destroy_workqueue(ib_wq);
+		goto err_nl;
 	}
 
+	return 0;
+
+err_nl:
+	ibnl_cleanup();
+
+err_sysfs:
+	ib_sysfs_cleanup();
+
+err:
+	destroy_workqueue(ib_wq);
 	return ret;
 }
 
 static void __exit ib_core_cleanup(void)
 {
 	ib_cache_cleanup();
+	ibnl_cleanup();
 	ib_sysfs_cleanup();
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 822cfdc..b4d8672 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -276,6 +276,13 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 		goto error1;
 	}
 
+	/* Verify the QP requested is supported.  For example, Ethernet devices
+	 * will not have QP0 */
+	if (!port_priv->qp_info[qpn].qp) {
+		ret = ERR_PTR(-EPROTONOSUPPORT);
+		goto error1;
+	}
+
 	/* Allocate structures */
 	mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
 	if (!mad_agent_priv) {
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
new file mode 100644
index 0000000..4a5abaf
--- /dev/null
+++ b/drivers/infiniband/core/netlink.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2010 Voltaire Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <rdma/rdma_netlink.h>
+
+struct ibnl_client {
+	struct list_head		list;
+	int				index;
+	int				nops;
+	const struct ibnl_client_cbs   *cb_table;
+};
+
+static DEFINE_MUTEX(ibnl_mutex);
+static struct sock *nls;
+static LIST_HEAD(client_list);
+
+int ibnl_add_client(int index, int nops,
+		    const struct ibnl_client_cbs cb_table[])
+{
+	struct ibnl_client *cur;
+	struct ibnl_client *nl_client;
+
+	nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL);
+	if (!nl_client)
+		return -ENOMEM;
+
+	nl_client->index	= index;
+	nl_client->nops		= nops;
+	nl_client->cb_table	= cb_table;
+
+	mutex_lock(&ibnl_mutex);
+
+	list_for_each_entry(cur, &client_list, list) {
+		if (cur->index == index) {
+			pr_warn("Client for %d already exists\n", index);
+			mutex_unlock(&ibnl_mutex);
+			kfree(nl_client);
+			return -EINVAL;
+		}
+	}
+
+	list_add_tail(&nl_client->list, &client_list);
+
+	mutex_unlock(&ibnl_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ibnl_add_client);
+
+int ibnl_remove_client(int index)
+{
+	struct ibnl_client *cur, *next;
+
+	mutex_lock(&ibnl_mutex);
+	list_for_each_entry_safe(cur, next, &client_list, list) {
+		if (cur->index == index) {
+			list_del(&(cur->list));
+			mutex_unlock(&ibnl_mutex);
+			kfree(cur);
+			return 0;
+		}
+	}
+	pr_warn("Can't remove callback for client idx %d. Not found\n", index);
+	mutex_unlock(&ibnl_mutex);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ibnl_remove_client);
+
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+		   int len, int client, int op)
+{
+	unsigned char *prev_tail;
+
+	prev_tail = skb_tail_pointer(skb);
+	*nlh = NLMSG_NEW(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
+			len, NLM_F_MULTI);
+	(*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail;
+	return NLMSG_DATA(*nlh);
+
+nlmsg_failure:
+	nlmsg_trim(skb, prev_tail);
+	return NULL;
+}
+EXPORT_SYMBOL(ibnl_put_msg);
+
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+		  int len, void *data, int type)
+{
+	unsigned char *prev_tail;
+
+	prev_tail = skb_tail_pointer(skb);
+	NLA_PUT(skb, type, len, data);
+	nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail;
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, prev_tail - nlh->nlmsg_len);
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ibnl_put_attr);
+
+static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct ibnl_client *client;
+	int type = nlh->nlmsg_type;
+	int index = RDMA_NL_GET_CLIENT(type);
+	int op = RDMA_NL_GET_OP(type);
+
+	list_for_each_entry(client, &client_list, list) {
+		if (client->index == index) {
+			if (op < 0 || op >= client->nops ||
+			    !client->cb_table[RDMA_NL_GET_OP(op)].dump)
+				return -EINVAL;
+			return netlink_dump_start(nls, skb, nlh,
+						  client->cb_table[op].dump,
+						  NULL);
+		}
+	}
+
+	pr_info("Index %d wasn't found in client list\n", index);
+	return -EINVAL;
+}
+
+static void ibnl_rcv(struct sk_buff *skb)
+{
+	mutex_lock(&ibnl_mutex);
+	netlink_rcv_skb(skb, &ibnl_rcv_msg);
+	mutex_unlock(&ibnl_mutex);
+}
+
+int __init ibnl_init(void)
+{
+	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv,
+				    NULL, THIS_MODULE);
+	if (!nls) {
+		pr_warn("Failed to create netlink socket\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void ibnl_cleanup(void)
+{
+	struct ibnl_client *cur, *next;
+
+	mutex_lock(&ibnl_mutex);
+	list_for_each_entry_safe(cur, next, &client_list, list) {
+		list_del(&(cur->list));
+		kfree(cur);
+	}
+	mutex_unlock(&ibnl_mutex);
+
+	netlink_kernel_release(nls);
+}
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index b3fa798..71be5ee 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -367,13 +367,28 @@ done:
 	return ret;
 }
 
-static ssize_t ucma_create_id(struct ucma_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+	switch (cmd->ps) {
+	case RDMA_PS_TCP:
+		*qp_type = IB_QPT_RC;
+		return 0;
+	case RDMA_PS_UDP:
+	case RDMA_PS_IPOIB:
+		*qp_type = IB_QPT_UD;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
 {
 	struct rdma_ucm_create_id cmd;
 	struct rdma_ucm_create_id_resp resp;
 	struct ucma_context *ctx;
+	enum ib_qp_type qp_type;
 	int ret;
 
 	if (out_len < sizeof(resp))
@@ -382,6 +397,10 @@ static ssize_t ucma_create_id(struct ucma_file *file,
 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
 		return -EFAULT;
 
+	ret = ucma_get_qp_type(&cmd, &qp_type);
+	if (ret)
+		return ret;
+
 	mutex_lock(&file->mut);
 	ctx = ucma_alloc_ctx(file);
 	mutex_unlock(&file->mut);
@@ -389,7 +408,7 @@ static ssize_t ucma_create_id(struct ucma_file *file,
 		return -ENOMEM;
 
 	ctx->uid = cmd.uid;
-	ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
+	ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
 	if (IS_ERR(ctx->cm_id)) {
 		ret = PTR_ERR(ctx->cm_id);
 		goto err1;
@@ -1338,9 +1357,11 @@ static const struct file_operations ucma_fops = {
 };
 
 static struct miscdevice ucma_misc = {
-	.minor	= MISC_DYNAMIC_MINOR,
-	.name	= "rdma_cm",
-	.fops	= &ucma_fops,
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "rdma_cm",
+	.nodename	= "infiniband/rdma_cm",
+	.mode		= 0666,
+	.fops		= &ucma_fops,
 };
 
 static ssize_t show_abi_version(struct device *dev,
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index cd1996d..8d261b6 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -1176,6 +1176,11 @@ static void ib_umad_remove_one(struct ib_device *device)
 	kref_put(&umad_dev->ref, ib_umad_release_dev);
 }
 
+static char *umad_devnode(struct device *dev, mode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
 static int __init ib_umad_init(void)
 {
 	int ret;
@@ -1194,6 +1199,8 @@ static int __init ib_umad_init(void)
 		goto out_chrdev;
 	}
 
+	umad_class->devnode = umad_devnode;
+
 	ret = class_create_file(umad_class, &class_attr_abi_version.attr);
 	if (ret) {
 		printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index ec83e9f..e49a85f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -824,6 +824,12 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	kfree(uverbs_dev);
 }
 
+static char *uverbs_devnode(struct device *dev, mode_t *mode)
+{
+	*mode = 0666;
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
 static int __init ib_uverbs_init(void)
 {
 	int ret;
@@ -842,6 +848,8 @@ static int __init ib_uverbs_init(void)
 		goto out_chrdev;
 	}
 
+	uverbs_class->devnode = uverbs_devnode;
+
 	ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
 	if (ret) {
 		printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 2391841..0a5008f 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -914,7 +914,7 @@ static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
 		goto err;
 
 	if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) {
-		iwch_post_zb_read(ep->com.qp);
+		iwch_post_zb_read(ep);
 	}
 
 	goto out;
@@ -1078,6 +1078,8 @@ static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 	struct iwch_ep *ep = ctx;
 	struct cpl_wr_ack *hdr = cplhdr(skb);
 	unsigned int credits = ntohs(hdr->credits);
+	unsigned long flags;
+	int post_zb = 0;
 
 	PDBG("%s ep %p credits %u\n", __func__, ep, credits);
 
@@ -1087,28 +1089,34 @@ static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 		return CPL_RET_BUF_DONE;
 	}
 
+	spin_lock_irqsave(&ep->com.lock, flags);
 	BUG_ON(credits != 1);
 	dst_confirm(ep->dst);
 	if (!ep->mpa_skb) {
 		PDBG("%s rdma_init wr_ack ep %p state %u\n",
-			__func__, ep, state_read(&ep->com));
+			__func__, ep, ep->com.state);
 		if (ep->mpa_attr.initiator) {
 			PDBG("%s initiator ep %p state %u\n",
-				__func__, ep, state_read(&ep->com));
-			if (peer2peer)
-				iwch_post_zb_read(ep->com.qp);
+				__func__, ep, ep->com.state);
+			if (peer2peer && ep->com.state == FPDU_MODE)
+				post_zb = 1;
 		} else {
 			PDBG("%s responder ep %p state %u\n",
-				__func__, ep, state_read(&ep->com));
-			ep->com.rpl_done = 1;
-			wake_up(&ep->com.waitq);
+				__func__, ep, ep->com.state);
+			if (ep->com.state == MPA_REQ_RCVD) {
+				ep->com.rpl_done = 1;
+				wake_up(&ep->com.waitq);
+			}
 		}
 	} else {
 		PDBG("%s lsm ack ep %p state %u freeing skb\n",
-			__func__, ep, state_read(&ep->com));
+			__func__, ep, ep->com.state);
 		kfree_skb(ep->mpa_skb);
 		ep->mpa_skb = NULL;
 	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (post_zb)
+		iwch_post_zb_read(ep);
 	return CPL_RET_BUF_DONE;
 }
 
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
index c5406da..9a342c9 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -332,7 +332,7 @@ int iwch_bind_mw(struct ib_qp *qp,
 			     struct ib_mw_bind *mw_bind);
 int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
-int iwch_post_zb_read(struct iwch_qp *qhp);
+int iwch_post_zb_read(struct iwch_ep *ep);
 int iwch_register_device(struct iwch_dev *dev);
 void iwch_unregister_device(struct iwch_dev *dev);
 void stop_read_rep_timer(struct iwch_qp *qhp);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 1b4cd09..ecd313f 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -738,7 +738,7 @@ static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
 	}
 }
 
-int iwch_post_zb_read(struct iwch_qp *qhp)
+int iwch_post_zb_read(struct iwch_ep *ep)
 {
 	union t3_wr *wqe;
 	struct sk_buff *skb;
@@ -761,10 +761,10 @@ int iwch_post_zb_read(struct iwch_qp *qhp)
 	wqe->read.local_len = cpu_to_be32(0);
 	wqe->read.local_to = cpu_to_be64(1);
 	wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ));
-	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)|
+	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)|
 						V_FW_RIWR_LEN(flit_cnt));
 	skb->priority = CPL_PRIORITY_DATA;
-	return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb);
+	return iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb);
 }
 
 /*
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 35d2a5d..4f04537 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -35,7 +35,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
-#include <linux/workqueue.h>
+#include <linux/completion.h>
 #include <linux/netdevice.h>
 #include <linux/sched.h>
 #include <linux/pci.h>
@@ -131,28 +131,21 @@ static inline int c4iw_num_stags(struct c4iw_rdev *rdev)
 
 #define C4IW_WR_TO (10*HZ)
 
-enum {
-	REPLY_READY = 0,
-};
-
 struct c4iw_wr_wait {
-	wait_queue_head_t wait;
-	unsigned long status;
+	struct completion completion;
 	int ret;
 };
 
 static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp)
 {
 	wr_waitp->ret = 0;
-	wr_waitp->status = 0;
-	init_waitqueue_head(&wr_waitp->wait);
+	init_completion(&wr_waitp->completion);
 }
 
 static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret)
 {
 	wr_waitp->ret = ret;
-	set_bit(REPLY_READY, &wr_waitp->status);
-	wake_up(&wr_waitp->wait);
+	complete(&wr_waitp->completion);
 }
 
 static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev,
@@ -164,8 +157,7 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev,
 	int ret;
 
 	do {
-		ret = wait_event_timeout(wr_waitp->wait,
-			test_and_clear_bit(REPLY_READY, &wr_waitp->status), to);
+		ret = wait_for_completion_timeout(&wr_waitp->completion, to);
 		if (!ret) {
 			printk(KERN_ERR MOD "%s - Device %s not responding - "
 			       "tid %u qpid %u\n", func,
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index 13de119..2d668c6 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -1138,7 +1138,9 @@ static ssize_t nes_store_wqm_quanta(struct device_driver *ddp,
 	u32 i = 0;
 	struct nes_device *nesdev;
 
-	strict_strtoul(buf, 0, &wqm_quanta_value);
+	if (kstrtoul(buf, 0, &wqm_quanta_value) < 0)
+		return -EINVAL;
+
 	list_for_each_entry(nesdev, &nes_dev_list, list) {
 		if (i == ee_flsh_adapter) {
 			nesdev->nesadapter->wqm_quanta = wqm_quanta_value;
diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig
index 7c03a70..8349f9c 100644
--- a/drivers/infiniband/hw/qib/Kconfig
+++ b/drivers/infiniband/hw/qib/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_QIB
 	tristate "QLogic PCIe HCA support"
-	depends on 64BIT && NET
+	depends on 64BIT
 	---help---
 	This is a low-level driver for QLogic PCIe QLE InfiniBand host
 	channel adapters.  This driver does not support the QLogic
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 98768657..ede1475 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -548,7 +548,7 @@ int iser_connect(struct iser_conn   *ib_conn,
 	iser_conn_get(ib_conn); /* ref ib conn's cma id */
 	ib_conn->cma_id = rdma_create_id(iser_cma_handler,
 					     (void *)ib_conn,
-					     RDMA_PS_TCP);
+					     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(ib_conn->cma_id)) {
 		err = PTR_ERR(ib_conn->cma_id);
 		iser_err("rdma_create_id failed: %d\n", err);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 376d640..ee165fd 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1147,7 +1147,7 @@ static void srp_process_aer_req(struct srp_target_port *target,
 static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 {
 	struct ib_device *dev = target->srp_host->srp_dev->dev;
-	struct srp_iu *iu = (struct srp_iu *) wc->wr_id;
+	struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;
 	int res;
 	u8 opcode;
 
@@ -1231,7 +1231,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
 			break;
 		}
 
-		iu = (struct srp_iu *) wc.wr_id;
+		iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
 		list_add(&iu->list, &target->free_tx);
 	}
 }
diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index fc14b8d..fbd96b2 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -271,8 +271,8 @@ config SPI_ORION
 	  This enables using the SPI master controller on the Orion chips.
 
 config SPI_PL022
-	tristate "ARM AMBA PL022 SSP controller (EXPERIMENTAL)"
-	depends on ARM_AMBA && EXPERIMENTAL
+	tristate "ARM AMBA PL022 SSP controller"
+	depends on ARM_AMBA
 	default y if MACH_U300
 	default y if ARCH_REALVIEW
 	default y if INTEGRATOR_IMPD1
diff --git a/drivers/spi/amba-pl022.c b/drivers/spi/amba-pl022.c
index 08de58e..6a9e58d 100644
--- a/drivers/spi/amba-pl022.c
+++ b/drivers/spi/amba-pl022.c
@@ -24,11 +24,6 @@
  * GNU General Public License for more details.
  */
 
-/*
- * TODO:
- * - add timeout on polled transfers
- */
-
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/device.h>
@@ -287,6 +282,8 @@
 
 #define CLEAR_ALL_INTERRUPTS  0x3
 
+#define SPI_POLLING_TIMEOUT 1000
+
 
 /*
  * The type of reading going on on this chip
@@ -1063,7 +1060,7 @@ static int __init pl022_dma_probe(struct pl022 *pl022)
 					    pl022->master_info->dma_filter,
 					    pl022->master_info->dma_rx_param);
 	if (!pl022->dma_rx_channel) {
-		dev_err(&pl022->adev->dev, "no RX DMA channel!\n");
+		dev_dbg(&pl022->adev->dev, "no RX DMA channel!\n");
 		goto err_no_rxchan;
 	}
 
@@ -1071,13 +1068,13 @@ static int __init pl022_dma_probe(struct pl022 *pl022)
 					    pl022->master_info->dma_filter,
 					    pl022->master_info->dma_tx_param);
 	if (!pl022->dma_tx_channel) {
-		dev_err(&pl022->adev->dev, "no TX DMA channel!\n");
+		dev_dbg(&pl022->adev->dev, "no TX DMA channel!\n");
 		goto err_no_txchan;
 	}
 
 	pl022->dummypage = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!pl022->dummypage) {
-		dev_err(&pl022->adev->dev, "no DMA dummypage!\n");
+		dev_dbg(&pl022->adev->dev, "no DMA dummypage!\n");
 		goto err_no_dummypage;
 	}
 
@@ -1093,6 +1090,8 @@ err_no_txchan:
 	dma_release_channel(pl022->dma_rx_channel);
 	pl022->dma_rx_channel = NULL;
 err_no_rxchan:
+	dev_err(&pl022->adev->dev,
+			"Failed to work in dma mode, work without dma!\n");
 	return -ENODEV;
 }
 
@@ -1378,6 +1377,7 @@ static void do_polling_transfer(struct pl022 *pl022)
 	struct spi_transfer *transfer = NULL;
 	struct spi_transfer *previous = NULL;
 	struct chip_data *chip;
+	unsigned long time, timeout;
 
 	chip = pl022->cur_chip;
 	message = pl022->cur_msg;
@@ -1415,9 +1415,19 @@ static void do_polling_transfer(struct pl022 *pl022)
 		       SSP_CR1(pl022->virtbase));
 
 		dev_dbg(&pl022->adev->dev, "polling transfer ongoing ...\n");
-		/* FIXME: insert a timeout so we don't hang here indefinitely */
-		while (pl022->tx < pl022->tx_end || pl022->rx < pl022->rx_end)
+
+		timeout = jiffies + msecs_to_jiffies(SPI_POLLING_TIMEOUT);
+		while (pl022->tx < pl022->tx_end || pl022->rx < pl022->rx_end) {
+			time = jiffies;
 			readwriter(pl022);
+			if (time_after(time, timeout)) {
+				dev_warn(&pl022->adev->dev,
+				"%s: timeout!\n", __func__);
+				message->state = STATE_ERROR;
+				goto out;
+			}
+			cpu_relax();
+		}
 
 		/* Update total byte transferred */
 		message->actual_length += pl022->cur_transfer->len;
@@ -1426,7 +1436,7 @@ static void do_polling_transfer(struct pl022 *pl022)
 		/* Move to next transfer */
 		message->state = next_transfer(pl022);
 	}
-
+out:
 	/* Handle end of message */
 	if (message->state == STATE_DONE)
 		message->status = 0;
@@ -2107,7 +2117,7 @@ pl022_probe(struct amba_device *adev, const struct amba_id *id)
 	if (platform_info->enable_dma) {
 		status = pl022_dma_probe(pl022);
 		if (status != 0)
-			goto err_no_dma;
+			platform_info->enable_dma = 0;
 	}
 
 	/* Initialize and start queue */
@@ -2143,7 +2153,6 @@ pl022_probe(struct amba_device *adev, const struct amba_id *id)
  err_init_queue:
 	destroy_queue(pl022);
 	pl022_dma_remove(pl022);
- err_no_dma:
 	free_irq(adev->irq[0], pl022);
  err_no_irq:
 	clk_put(pl022->clk);
diff --git a/drivers/spi/dw_spi.c b/drivers/spi/dw_spi.c
index 871e337..919fa9d 100644
--- a/drivers/spi/dw_spi.c
+++ b/drivers/spi/dw_spi.c
@@ -58,8 +58,6 @@ struct chip_data {
 	u8 bits_per_word;
 	u16 clk_div;		/* baud rate divider */
 	u32 speed_hz;		/* baud rate */
-	int (*write)(struct dw_spi *dws);
-	int (*read)(struct dw_spi *dws);
 	void (*cs_control)(u32 command);
 };
 
@@ -162,107 +160,70 @@ static inline void mrst_spi_debugfs_remove(struct dw_spi *dws)
 }
 #endif /* CONFIG_DEBUG_FS */
 
-static void wait_till_not_busy(struct dw_spi *dws)
+/* Return the max entries we can fill into tx fifo */
+static inline u32 tx_max(struct dw_spi *dws)
 {
-	unsigned long end = jiffies + 1 + usecs_to_jiffies(5000);
+	u32 tx_left, tx_room, rxtx_gap;
 
-	while (time_before(jiffies, end)) {
-		if (!(dw_readw(dws, sr) & SR_BUSY))
-			return;
-		cpu_relax();
-	}
-	dev_err(&dws->master->dev,
-		"DW SPI: Status keeps busy for 5000us after a read/write!\n");
-}
-
-static void flush(struct dw_spi *dws)
-{
-	while (dw_readw(dws, sr) & SR_RF_NOT_EMPT) {
-		dw_readw(dws, dr);
-		cpu_relax();
-	}
-
-	wait_till_not_busy(dws);
-}
-
-static int null_writer(struct dw_spi *dws)
-{
-	u8 n_bytes = dws->n_bytes;
+	tx_left = (dws->tx_end - dws->tx) / dws->n_bytes;
+	tx_room = dws->fifo_len - dw_readw(dws, txflr);
 
-	if (!(dw_readw(dws, sr) & SR_TF_NOT_FULL)
-		|| (dws->tx == dws->tx_end))
-		return 0;
-	dw_writew(dws, dr, 0);
-	dws->tx += n_bytes;
+	/*
+	 * Another concern is about the tx/rx mismatch, we
+	 * though to use (dws->fifo_len - rxflr - txflr) as
+	 * one maximum value for tx, but it doesn't cover the
+	 * data which is out of tx/rx fifo and inside the
+	 * shift registers. So a control from sw point of
+	 * view is taken.
+	 */
+	rxtx_gap =  ((dws->rx_end - dws->rx) - (dws->tx_end - dws->tx))
+			/ dws->n_bytes;
 
-	wait_till_not_busy(dws);
-	return 1;
+	return min3(tx_left, tx_room, (u32) (dws->fifo_len - rxtx_gap));
 }
 
-static int null_reader(struct dw_spi *dws)
+/* Return the max entries we should read out of rx fifo */
+static inline u32 rx_max(struct dw_spi *dws)
 {
-	u8 n_bytes = dws->n_bytes;
+	u32 rx_left = (dws->rx_end - dws->rx) / dws->n_bytes;
 
-	while ((dw_readw(dws, sr) & SR_RF_NOT_EMPT)
-		&& (dws->rx < dws->rx_end)) {
-		dw_readw(dws, dr);
-		dws->rx += n_bytes;
-	}
-	wait_till_not_busy(dws);
-	return dws->rx == dws->rx_end;
+	return min(rx_left, (u32)dw_readw(dws, rxflr));
 }
 
-static int u8_writer(struct dw_spi *dws)
+static void dw_writer(struct dw_spi *dws)
 {
-	if (!(dw_readw(dws, sr) & SR_TF_NOT_FULL)
-		|| (dws->tx == dws->tx_end))
-		return 0;
+	u32 max = tx_max(dws);
+	u16 txw = 0;
 
-	dw_writew(dws, dr, *(u8 *)(dws->tx));
-	++dws->tx;
-
-	wait_till_not_busy(dws);
-	return 1;
-}
-
-static int u8_reader(struct dw_spi *dws)
-{
-	while ((dw_readw(dws, sr) & SR_RF_NOT_EMPT)
-		&& (dws->rx < dws->rx_end)) {
-		*(u8 *)(dws->rx) = dw_readw(dws, dr);
-		++dws->rx;
+	while (max--) {
+		/* Set the tx word if the transfer's original "tx" is not null */
+		if (dws->tx_end - dws->len) {
+			if (dws->n_bytes == 1)
+				txw = *(u8 *)(dws->tx);
+			else
+				txw = *(u16 *)(dws->tx);
+		}
+		dw_writew(dws, dr, txw);
+		dws->tx += dws->n_bytes;
 	}
-
-	wait_till_not_busy(dws);
-	return dws->rx == dws->rx_end;
 }
 
-static int u16_writer(struct dw_spi *dws)
+static void dw_reader(struct dw_spi *dws)
 {
-	if (!(dw_readw(dws, sr) & SR_TF_NOT_FULL)
-		|| (dws->tx == dws->tx_end))
-		return 0;
+	u32 max = rx_max(dws);
+	u16 rxw;
 
-	dw_writew(dws, dr, *(u16 *)(dws->tx));
-	dws->tx += 2;
-
-	wait_till_not_busy(dws);
-	return 1;
-}
-
-static int u16_reader(struct dw_spi *dws)
-{
-	u16 temp;
-
-	while ((dw_readw(dws, sr) & SR_RF_NOT_EMPT)
-		&& (dws->rx < dws->rx_end)) {
-		temp = dw_readw(dws, dr);
-		*(u16 *)(dws->rx) = temp;
-		dws->rx += 2;
+	while (max--) {
+		rxw = dw_readw(dws, dr);
+		/* Care rx only if the transfer's original "rx" is not null */
+		if (dws->rx_end - dws->len) {
+			if (dws->n_bytes == 1)
+				*(u8 *)(dws->rx) = rxw;
+			else
+				*(u16 *)(dws->rx) = rxw;
+		}
+		dws->rx += dws->n_bytes;
 	}
-
-	wait_till_not_busy(dws);
-	return dws->rx == dws->rx_end;
 }
 
 static void *next_transfer(struct dw_spi *dws)
@@ -334,8 +295,7 @@ static void giveback(struct dw_spi *dws)
 
 static void int_error_stop(struct dw_spi *dws, const char *msg)
 {
-	/* Stop and reset hw */
-	flush(dws);
+	/* Stop the hw */
 	spi_enable_chip(dws, 0);
 
 	dev_err(&dws->master->dev, "%s\n", msg);
@@ -362,35 +322,28 @@ EXPORT_SYMBOL_GPL(dw_spi_xfer_done);
 
 static irqreturn_t interrupt_transfer(struct dw_spi *dws)
 {
-	u16 irq_status, irq_mask = 0x3f;
-	u32 int_level = dws->fifo_len / 2;
-	u32 left;
+	u16 irq_status = dw_readw(dws, isr);
 
-	irq_status = dw_readw(dws, isr) & irq_mask;
 	/* Error handling */
 	if (irq_status & (SPI_INT_TXOI | SPI_INT_RXOI | SPI_INT_RXUI)) {
 		dw_readw(dws, txoicr);
 		dw_readw(dws, rxoicr);
 		dw_readw(dws, rxuicr);
-		int_error_stop(dws, "interrupt_transfer: fifo overrun");
+		int_error_stop(dws, "interrupt_transfer: fifo overrun/underrun");
 		return IRQ_HANDLED;
 	}
 
+	dw_reader(dws);
+	if (dws->rx_end == dws->rx) {
+		spi_mask_intr(dws, SPI_INT_TXEI);
+		dw_spi_xfer_done(dws);
+		return IRQ_HANDLED;
+	}
 	if (irq_status & SPI_INT_TXEI) {
 		spi_mask_intr(dws, SPI_INT_TXEI);
-
-		left = (dws->tx_end - dws->tx) / dws->n_bytes;
-		left = (left > int_level) ? int_level : left;
-
-		while (left--)
-			dws->write(dws);
-		dws->read(dws);
-
-		/* Re-enable the IRQ if there is still data left to tx */
-		if (dws->tx_end > dws->tx)
-			spi_umask_intr(dws, SPI_INT_TXEI);
-		else
-			dw_spi_xfer_done(dws);
+		dw_writer(dws);
+		/* Enable TX irq always, it will be disabled when RX finished */
+		spi_umask_intr(dws, SPI_INT_TXEI);
 	}
 
 	return IRQ_HANDLED;
@@ -399,15 +352,13 @@ static irqreturn_t interrupt_transfer(struct dw_spi *dws)
 static irqreturn_t dw_spi_irq(int irq, void *dev_id)
 {
 	struct dw_spi *dws = dev_id;
-	u16 irq_status, irq_mask = 0x3f;
+	u16 irq_status = dw_readw(dws, isr) & 0x3f;
 
-	irq_status = dw_readw(dws, isr) & irq_mask;
 	if (!irq_status)
 		return IRQ_NONE;
 
 	if (!dws->cur_msg) {
 		spi_mask_intr(dws, SPI_INT_TXEI);
-		/* Never fail */
 		return IRQ_HANDLED;
 	}
 
@@ -417,13 +368,11 @@ static irqreturn_t dw_spi_irq(int irq, void *dev_id)
 /* Must be called inside pump_transfers() */
 static void poll_transfer(struct dw_spi *dws)
 {
-	while (dws->write(dws))
-		dws->read(dws);
-	/*
-	 * There is a possibility that the last word of a transaction
-	 * will be lost if data is not ready. Re-read to solve this issue.
-	 */
-	dws->read(dws);
+	do {
+		dw_writer(dws);
+		dw_reader(dws);
+		cpu_relax();
+	} while (dws->rx_end > dws->rx);
 
 	dw_spi_xfer_done(dws);
 }
@@ -483,8 +432,6 @@ static void pump_transfers(unsigned long data)
 	dws->tx_end = dws->tx + transfer->len;
 	dws->rx = transfer->rx_buf;
 	dws->rx_end = dws->rx + transfer->len;
-	dws->write = dws->tx ? chip->write : null_writer;
-	dws->read = dws->rx ? chip->read : null_reader;
 	dws->cs_change = transfer->cs_change;
 	dws->len = dws->cur_transfer->len;
 	if (chip != dws->prev_chip)
@@ -518,20 +465,8 @@ static void pump_transfers(unsigned long data)
 
 		switch (bits) {
 		case 8:
-			dws->n_bytes = 1;
-			dws->dma_width = 1;
-			dws->read = (dws->read != null_reader) ?
-					u8_reader : null_reader;
-			dws->write = (dws->write != null_writer) ?
-					u8_writer : null_writer;
-			break;
 		case 16:
-			dws->n_bytes = 2;
-			dws->dma_width = 2;
-			dws->read = (dws->read != null_reader) ?
-					u16_reader : null_reader;
-			dws->write = (dws->write != null_writer) ?
-					u16_writer : null_writer;
+			dws->n_bytes = dws->dma_width = bits >> 3;
 			break;
 		default:
 			printk(KERN_ERR "MRST SPI0: unsupported bits:"
@@ -575,7 +510,7 @@ static void pump_transfers(unsigned long data)
 		txint_level = dws->fifo_len / 2;
 		txint_level = (templen > txint_level) ? txint_level : templen;
 
-		imask |= SPI_INT_TXEI;
+		imask |= SPI_INT_TXEI | SPI_INT_TXOI | SPI_INT_RXUI | SPI_INT_RXOI;
 		dws->transfer_handler = interrupt_transfer;
 	}
 
@@ -733,13 +668,9 @@ static int dw_spi_setup(struct spi_device *spi)
 	if (spi->bits_per_word <= 8) {
 		chip->n_bytes = 1;
 		chip->dma_width = 1;
-		chip->read = u8_reader;
-		chip->write = u8_writer;
 	} else if (spi->bits_per_word <= 16) {
 		chip->n_bytes = 2;
 		chip->dma_width = 2;
-		chip->read = u16_reader;
-		chip->write = u16_writer;
 	} else {
 		/* Never take >16b case for MRST SPIC */
 		dev_err(&spi->dev, "invalid wordsize\n");
@@ -851,7 +782,6 @@ static void spi_hw_init(struct dw_spi *dws)
 	spi_enable_chip(dws, 0);
 	spi_mask_intr(dws, 0xff);
 	spi_enable_chip(dws, 1);
-	flush(dws);
 
 	/*
 	 * Try to detect the FIFO depth if not set by interface driver,
diff --git a/drivers/spi/dw_spi.h b/drivers/spi/dw_spi.h
index b23e452..7a5e78d 100644
--- a/drivers/spi/dw_spi.h
+++ b/drivers/spi/dw_spi.h
@@ -137,8 +137,6 @@ struct dw_spi {
 	u8			max_bits_per_word;	/* maxim is 16b */
 	u32			dma_width;
 	int			cs_change;
-	int			(*write)(struct dw_spi *dws);
-	int			(*read)(struct dw_spi *dws);
 	irqreturn_t		(*transfer_handler)(struct dw_spi *dws);
 	void			(*cs_control)(u32 command);
 
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 82b9a42..2e13a14 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1047,8 +1047,8 @@ static u8	*buf;
  * spi_{async,sync}() calls with dma-safe buffers.
  */
 int spi_write_then_read(struct spi_device *spi,
-		const u8 *txbuf, unsigned n_tx,
-		u8 *rxbuf, unsigned n_rx)
+		const void *txbuf, unsigned n_tx,
+		void *rxbuf, unsigned n_rx)
 {
 	static DEFINE_MUTEX(lock);
 
diff --git a/drivers/spi/spi_nuc900.c b/drivers/spi/spi_nuc900.c
index d5be18b..3cd15f6 100644
--- a/drivers/spi/spi_nuc900.c
+++ b/drivers/spi/spi_nuc900.c
@@ -463,7 +463,7 @@ static int __devexit nuc900_spi_remove(struct platform_device *dev)
 
 	platform_set_drvdata(dev, NULL);
 
-	spi_unregister_master(hw->master);
+	spi_bitbang_stop(&hw->bitbang);
 
 	clk_disable(hw->clk);
 	clk_put(hw->clk);
diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c
index 151a95e..1a5fcab 100644
--- a/drivers/spi/spi_s3c24xx.c
+++ b/drivers/spi/spi_s3c24xx.c
@@ -668,7 +668,7 @@ static int __exit s3c24xx_spi_remove(struct platform_device *dev)
 
 	platform_set_drvdata(dev, NULL);
 
-	spi_unregister_master(hw->master);
+	spi_bitbang_stop(&hw->bitbang);
 
 	clk_disable(hw->clk);
 	clk_put(hw->clk);
diff --git a/drivers/spi/spi_sh.c b/drivers/spi/spi_sh.c
index 869a07d..9eedd71 100644
--- a/drivers/spi/spi_sh.c
+++ b/drivers/spi/spi_sh.c
@@ -427,10 +427,10 @@ static int __devexit spi_sh_remove(struct platform_device *pdev)
 {
 	struct spi_sh_data *ss = dev_get_drvdata(&pdev->dev);
 
+	spi_unregister_master(ss->master);
 	destroy_workqueue(ss->workqueue);
 	free_irq(ss->irq, ss);
 	iounmap(ss->addr);
-	spi_master_put(ss->master);
 
 	return 0;
 }
diff --git a/drivers/spi/spi_tegra.c b/drivers/spi/spi_tegra.c
index 891e590..6c3aa6e 100644
--- a/drivers/spi/spi_tegra.c
+++ b/drivers/spi/spi_tegra.c
@@ -578,6 +578,7 @@ static int __devexit spi_tegra_remove(struct platform_device *pdev)
 	master = dev_get_drvdata(&pdev->dev);
 	tspi = spi_master_get_devdata(master);
 
+	spi_unregister_master(master);
 	tegra_dma_free_channel(tspi->rx_dma);
 
 	dma_free_coherent(&pdev->dev, sizeof(u32) * BB_LEN,
@@ -586,7 +587,6 @@ static int __devexit spi_tegra_remove(struct platform_device *pdev)
 	clk_put(tspi->clk);
 	iounmap(tspi->base);
 
-	spi_master_put(master);
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	release_mem_region(r->start, (r->end - r->start) + 1);
 
diff --git a/drivers/video/omap/Makefile b/drivers/video/omap/Makefile
index 49226a1..25db556 100644
--- a/drivers/video/omap/Makefile
+++ b/drivers/video/omap/Makefile
@@ -30,7 +30,6 @@ objs-y$(CONFIG_MACH_OMAP_APOLLON) += lcd_apollon.o
 objs-y$(CONFIG_MACH_OMAP_2430SDP) += lcd_2430sdp.o
 objs-y$(CONFIG_MACH_OMAP_3430SDP) += lcd_2430sdp.o
 objs-y$(CONFIG_MACH_OMAP_LDP) += lcd_ldp.o
-objs-y$(CONFIG_MACH_OMAP2EVM) += lcd_omap2evm.o
 objs-y$(CONFIG_MACH_OMAP3EVM) += lcd_omap3evm.o
 objs-y$(CONFIG_MACH_OMAP3_BEAGLE) += lcd_omap3beagle.o
 objs-y$(CONFIG_FB_OMAP_LCD_MIPID) += lcd_mipid.o
diff --git a/drivers/video/omap/lcd_omap2evm.c b/drivers/video/omap/lcd_omap2evm.c
deleted file mode 100644
index 7e7a65c..0000000
--- a/drivers/video/omap/lcd_omap2evm.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * LCD panel support for the MISTRAL OMAP2EVM board
- *
- * Author: Arun C <arunedarath@mistralsolutions.com>
- *
- * Derived from drivers/video/omap/lcd_omap3evm.c
- * Derived from drivers/video/omap/lcd-apollon.c
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/i2c/twl.h>
-
-#include <plat/mux.h>
-#include <asm/mach-types.h>
-
-#include "omapfb.h"
-
-#define LCD_PANEL_ENABLE_GPIO	154
-#define LCD_PANEL_LR		128
-#define LCD_PANEL_UD		129
-#define LCD_PANEL_INI		152
-#define LCD_PANEL_QVGA		148
-#define LCD_PANEL_RESB		153
-
-#define TWL_LED_LEDEN		0x00
-#define TWL_PWMA_PWMAON		0x00
-#define TWL_PWMA_PWMAOFF	0x01
-
-static unsigned int bklight_level;
-
-static int omap2evm_panel_init(struct lcd_panel *panel,
-				struct omapfb_device *fbdev)
-{
-	gpio_request(LCD_PANEL_ENABLE_GPIO, "LCD enable");
-	gpio_request(LCD_PANEL_LR, "LCD lr");
-	gpio_request(LCD_PANEL_UD, "LCD ud");
-	gpio_request(LCD_PANEL_INI, "LCD ini");
-	gpio_request(LCD_PANEL_QVGA, "LCD qvga");
-	gpio_request(LCD_PANEL_RESB, "LCD resb");
-
-	gpio_direction_output(LCD_PANEL_ENABLE_GPIO, 1);
-	gpio_direction_output(LCD_PANEL_RESB, 1);
-	gpio_direction_output(LCD_PANEL_INI, 1);
-	gpio_direction_output(LCD_PANEL_QVGA, 0);
-	gpio_direction_output(LCD_PANEL_LR, 1);
-	gpio_direction_output(LCD_PANEL_UD, 1);
-
-	twl_i2c_write_u8(TWL4030_MODULE_LED, 0x11, TWL_LED_LEDEN);
-	twl_i2c_write_u8(TWL4030_MODULE_PWMA, 0x01, TWL_PWMA_PWMAON);
-	twl_i2c_write_u8(TWL4030_MODULE_PWMA, 0x02, TWL_PWMA_PWMAOFF);
-	bklight_level = 100;
-
-	return 0;
-}
-
-static void omap2evm_panel_cleanup(struct lcd_panel *panel)
-{
-	gpio_free(LCD_PANEL_RESB);
-	gpio_free(LCD_PANEL_QVGA);
-	gpio_free(LCD_PANEL_INI);
-	gpio_free(LCD_PANEL_UD);
-	gpio_free(LCD_PANEL_LR);
-	gpio_free(LCD_PANEL_ENABLE_GPIO);
-}
-
-static int omap2evm_panel_enable(struct lcd_panel *panel)
-{
-	gpio_set_value(LCD_PANEL_ENABLE_GPIO, 0);
-	return 0;
-}
-
-static void omap2evm_panel_disable(struct lcd_panel *panel)
-{
-	gpio_set_value(LCD_PANEL_ENABLE_GPIO, 1);
-}
-
-static unsigned long omap2evm_panel_get_caps(struct lcd_panel *panel)
-{
-	return 0;
-}
-
-static int omap2evm_bklight_setlevel(struct lcd_panel *panel,
-						unsigned int level)
-{
-	u8 c;
-	if ((level >= 0) && (level <= 100)) {
-		c = (125 * (100 - level)) / 100 + 2;
-		twl_i2c_write_u8(TWL4030_MODULE_PWMA, c, TWL_PWMA_PWMAOFF);
-		bklight_level = level;
-	}
-	return 0;
-}
-
-static unsigned int omap2evm_bklight_getlevel(struct lcd_panel *panel)
-{
-	return bklight_level;
-}
-
-static unsigned int omap2evm_bklight_getmaxlevel(struct lcd_panel *panel)
-{
-	return 100;
-}
-
-struct lcd_panel omap2evm_panel = {
-	.name		= "omap2evm",
-	.config		= OMAP_LCDC_PANEL_TFT | OMAP_LCDC_INV_VSYNC |
-			  OMAP_LCDC_INV_HSYNC,
-
-	.bpp		= 16,
-	.data_lines	= 18,
-	.x_res		= 480,
-	.y_res		= 640,
-	.hsw		= 3,
-	.hfp		= 0,
-	.hbp		= 28,
-	.vsw		= 2,
-	.vfp		= 1,
-	.vbp		= 0,
-
-	.pixel_clock	= 20000,
-
-	.init		= omap2evm_panel_init,
-	.cleanup	= omap2evm_panel_cleanup,
-	.enable		= omap2evm_panel_enable,
-	.disable	= omap2evm_panel_disable,
-	.get_caps	= omap2evm_panel_get_caps,
-	.set_bklight_level      = omap2evm_bklight_setlevel,
-	.get_bklight_level      = omap2evm_bklight_getlevel,
-	.get_bklight_max        = omap2evm_bklight_getmaxlevel,
-};
-
-static int omap2evm_panel_probe(struct platform_device *pdev)
-{
-	omapfb_register_panel(&omap2evm_panel);
-	return 0;
-}
-
-static int omap2evm_panel_remove(struct platform_device *pdev)
-{
-	return 0;
-}
-
-static int omap2evm_panel_suspend(struct platform_device *pdev,
-				   pm_message_t mesg)
-{
-	return 0;
-}
-
-static int omap2evm_panel_resume(struct platform_device *pdev)
-{
-	return 0;
-}
-
-struct platform_driver omap2evm_panel_driver = {
-	.probe		= omap2evm_panel_probe,
-	.remove		= omap2evm_panel_remove,
-	.suspend	= omap2evm_panel_suspend,
-	.resume		= omap2evm_panel_resume,
-	.driver		= {
-		.name	= "omap2evm_lcd",
-		.owner	= THIS_MODULE,
-	},
-};
-
-static int __init omap2evm_panel_drv_init(void)
-{
-	return platform_driver_register(&omap2evm_panel_driver);
-}
-
-static void __exit omap2evm_panel_drv_exit(void)
-{
-	platform_driver_unregister(&omap2evm_panel_driver);
-}
-
-module_init(omap2evm_panel_drv_init);
-module_exit(omap2evm_panel_drv_exit);
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 4781f80..bbc1825 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,5 +1,6 @@
 obj-y	+= grant-table.o features.o events.o manage.o balloon.o
 obj-y	+= xenbus/
+obj-y	+= tmem.o
 
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o			:= $(nostackp)
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
new file mode 100644
index 0000000..816a449
--- /dev/null
+++ b/drivers/xen/tmem.c
@@ -0,0 +1,264 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/cleancache.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
+
+#define TMEM_CONTROL               0
+#define TMEM_NEW_POOL              1
+#define TMEM_DESTROY_POOL          2
+#define TMEM_NEW_PAGE              3
+#define TMEM_PUT_PAGE              4
+#define TMEM_GET_PAGE              5
+#define TMEM_FLUSH_PAGE            6
+#define TMEM_FLUSH_OBJECT          7
+#define TMEM_READ                  8
+#define TMEM_WRITE                 9
+#define TMEM_XCHG                 10
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PAGESIZE_SHIFT   4
+#define TMEM_VERSION_SHIFT        24
+
+
+struct tmem_pool_uuid {
+	u64 uuid_lo;
+	u64 uuid_hi;
+};
+
+struct tmem_oid {
+	u64 oid[3];
+};
+
+#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 }
+
+/* flags for tmem_ops.new_pool */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+
+/* xen tmem foundation ops/hypercalls */
+
+static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
+	u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.pool_id = tmem_pool;
+	op.u.gen.oid[0] = oid.oid[0];
+	op.u.gen.oid[1] = oid.oid[1];
+	op.u.gen.oid[2] = oid.oid[2];
+	op.u.gen.index = index;
+	op.u.gen.tmem_offset = tmem_offset;
+	op.u.gen.pfn_offset = pfn_offset;
+	op.u.gen.len = len;
+	set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
+				u32 flags, unsigned long pagesize)
+{
+	struct tmem_op op;
+	int rc = 0, pageshift;
+
+	for (pageshift = 0; pagesize != 1; pageshift++)
+		pagesize >>= 1;
+	flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT;
+	flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
+	op.cmd = TMEM_NEW_POOL;
+	op.u.new.uuid[0] = uuid.uuid_lo;
+	op.u.new.uuid[1] = uuid.uuid_hi;
+	op.u.new.flags = flags;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+/* xen generic tmem ops */
+
+static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, unsigned long pfn)
+{
+	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
+		gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, unsigned long pfn)
+{
+	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+	return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
+		gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
+{
+	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index,
+		0, 0, 0, 0);
+}
+
+static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
+{
+	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+	struct tmem_oid oid = { { 0 } };
+
+	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+int tmem_enabled;
+
+static int __init enable_tmem(char *s)
+{
+	tmem_enabled = 1;
+	return 1;
+}
+
+__setup("tmem", enable_tmem);
+
+/* cleancache ops */
+
+static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
+				     pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	unsigned long pfn = page_to_pfn(page);
+
+	if (pool < 0)
+		return;
+	if (ind != index)
+		return;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	(void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+}
+
+static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
+				    pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	unsigned long pfn = page_to_pfn(page);
+	int ret;
+
+	/* translate return values to linux semantics */
+	if (pool < 0)
+		return -1;
+	if (ind != index)
+		return -1;
+	ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
+	if (ret == 1)
+		return 0;
+	else
+		return -1;
+}
+
+static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key,
+				       pgoff_t index)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool < 0)
+		return;
+	if (ind != index)
+		return;
+	(void)xen_tmem_flush_page((u32)pool, oid, ind);
+}
+
+static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool < 0)
+		return;
+	(void)xen_tmem_flush_object((u32)pool, oid);
+}
+
+static void tmem_cleancache_flush_fs(int pool)
+{
+	if (pool < 0)
+		return;
+	(void)xen_tmem_destroy_pool((u32)pool);
+}
+
+static int tmem_cleancache_init_fs(size_t pagesize)
+{
+	struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID;
+
+	return xen_tmem_new_pool(uuid_private, 0, pagesize);
+}
+
+static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	struct tmem_pool_uuid shared_uuid;
+
+	shared_uuid.uuid_lo = *(u64 *)uuid;
+	shared_uuid.uuid_hi = *(u64 *)(&uuid[8]);
+	return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
+}
+
+static int use_cleancache = 1;
+
+static int __init no_cleancache(char *s)
+{
+	use_cleancache = 0;
+	return 1;
+}
+
+__setup("nocleancache", no_cleancache);
+
+static struct cleancache_ops tmem_cleancache_ops = {
+	.put_page = tmem_cleancache_put_page,
+	.get_page = tmem_cleancache_get_page,
+	.flush_page = tmem_cleancache_flush_page,
+	.flush_inode = tmem_cleancache_flush_inode,
+	.flush_fs = tmem_cleancache_flush_fs,
+	.init_shared_fs = tmem_cleancache_init_shared_fs,
+	.init_fs = tmem_cleancache_init_fs
+};
+
+static int __init xen_tmem_init(void)
+{
+	struct cleancache_ops old_ops;
+
+	if (!xen_domain())
+		return 0;
+#ifdef CONFIG_CLEANCACHE
+	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
+	if (tmem_enabled && use_cleancache) {
+		char *s = "";
+		old_ops = cleancache_register_ops(&tmem_cleancache_ops);
+		if (old_ops.init_fs != NULL)
+			s = " (WARNING: cleancache_ops overridden)";
+		printk(KERN_INFO "cleancache enabled, RAM provided by "
+				 "Xen Transcendent Memory%s\n", s);
+	}
+#endif
+	return 0;
+}
+
+module_init(xen_tmem_init)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c677..8d7f3e6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
+	dentry_unhash(d);
 	return v9fs_remove(i, d, 1);
 }
 
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
diff --git a/fs/Kconfig b/fs/Kconfig
index f6edba2..19891aa 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	bool
+	tristate
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc..03330e2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 		 dentry->d_inode->i_ino,
 		 (int)dentry->d_name.len, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	return affs_remove_header(dentry);
 }
 
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f..2c4e051 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct key *key;
 	int ret;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	vnode = AFS_FS_I(old_dentry->d_inode);
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23..87d95a8 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	dentry_unhash(dentry);
+
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebf..c7d1d06 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_bh = new_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96fcfa5..4f98932 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -11,6 +11,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
+#include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -2016,6 +2017,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 	set_page_extent_mapped(page);
 
+	if (!PageUptodate(page)) {
+		if (cleancache_get_page(page) == 0) {
+			BUG_ON(blocksize != PAGE_SIZE);
+			goto out;
+		}
+	}
+
 	end = page_end;
 	while (1) {
 		lock_extent(tree, start, end, GFP_NOFS);
@@ -2149,6 +2157,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		cur = cur + iosize;
 		page_offset += iosize;
 	}
+out:
 	if (!nr) {
 		if (!PageError(page))
 			SetPageUptodate(page);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0ac712e..be4ffa1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,6 +39,7 @@
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/cleancache.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -624,6 +625,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_root = root_dentry;
 
 	save_mount_options(sb, data);
+	cleancache_init_fs(sb);
 	return 0;
 
 fail_close:
diff --git a/fs/buffer.c b/fs/buffer.c
index a08bb8e..698c6b2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
  */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	int ret;
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		unlock_page(page);
-		goto out;
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-	if (unlikely(ret)) {
-		unlock_page(page);
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
-	} else
-		ret = VM_FAULT_LOCKED;
-
-out:
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	/*
+	 * Freezing in progress? We check after the page is marked dirty and
+	 * with page lock held so if the test here fails, we are sure freezing
+	 * code will wait during syncing until the page fault is done - at that
+	 * point page will be dirty and unlocked so freezing code will write it
+	 * and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	return 0;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+	/*
+	 * This check is racy but catches the common case. The check in
+	 * __block_page_mkwrite() is reliable.
+	 */
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4..a46126f 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
+	dentry_unhash(de);
+
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b..9d17d35 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
+	dentry_unhash(dentry);
+
 	if (dentry->d_parent == configfs_sb->s_root)
 		return -EPERM;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a..227b409 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct dentry *lower_dir_dentry;
 	int rc;
 
+	dentry_unhash(dentry);
+
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
@@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap = NULL;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
 	dget(lower_old_dentry);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c6a9e0..aad153e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,6 +36,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/log2.h>
+#include <linux/cleancache.h>
 
 #include <asm/uaccess.h>
 
@@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 	} else {
 		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
+	cleancache_init_fs(sb);
 	return res;
 }
 
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36..04109460 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139..264f694 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:			handle to this transaction
- * @sb:				super block
- * @block:			start physcial block to add to the block group
- * @count:			number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata.  Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 						EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope.  Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b..a74b89c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
-
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1615,6 +1639,67 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb;  /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 						       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       int len,
 			       struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef..f5240aa 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
 
 #include <trace/events/ext4.h>
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh)
-{
-	int err = 0;
-
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_get_undo_access(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__, bh,
-						  handle, err);
-	}
-	return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f5353..bb85757 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn,
 		struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh);
 
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
 			      handle_t *handle, struct super_block *sb);
 
-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f..5199bac 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
 
 #include <trace/events/ext4.h>
 
+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
 	return newblock;
 }
 
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	}
 	ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_uninitialized(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  * - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path,
-				struct ext4_extent *newext, int at)
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
 	struct ext4_extent_header *neh;
 	struct ext4_extent_idx *fidx;
-	struct ext4_extent *ex;
 	int i = at, k, m, a;
 	ext4_fsblk_t newblock, oldblock;
 	__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
 		newblock = ext4_ext_new_meta_block(handle, inode, path,
-						   newext, &err);
+						   newext, &err, flags);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
-	ex = EXT_FIRST_EXTENT(neh);
 
 	/* move remainder of path[depth] to the new leaf */
 	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 	/* start copy from next extent */
-	/* TODO: we could do it by single memmove */
-	m = 0;
-	path[depth].p_ext++;
-	while (path[depth].p_ext <=
-			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext4_ext_pblock(path[depth].p_ext),
-				ext4_ext_is_uninitialized(path[depth].p_ext),
-				ext4_ext_get_actual_len(path[depth].p_ext),
-				newblock);
-		/*memmove(ex++, path[depth].p_ext++,
-				sizeof(struct ext4_extent));
-		neh->eh_entries++;*/
-		path[depth].p_ext++;
-		m++;
-	}
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
 	if (m) {
-		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
 		le16_add_cpu(&neh->eh_entries, m);
 	}
 
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
 				i, newblock, le32_to_cpu(border), oldblock);
-		/* copy indexes */
-		m = 0;
-		path[i].p_idx++;
 
-		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-				EXT_MAX_INDEX(path[i].p_hdr));
+		/* move remainder of path[i] to the new index block */
 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
 					EXT_LAST_INDEX(path[i].p_hdr))) {
 			EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			err = -EIO;
 			goto cleanup;
 		}
-		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-			ext_debug("%d: move %d:%llu in new index %llu\n", i,
-					le32_to_cpu(path[i].p_idx->ei_block),
-					ext4_idx_pblock(path[i].p_idx),
-					newblock);
-			/*memmove(++fidx, path[i].p_idx++,
-					sizeof(struct ext4_extent_idx));
-			neh->eh_entries++;
-			BUG_ON(neh->eh_entries > neh->eh_max);*/
-			path[i].p_idx++;
-			m++;
-		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
 		if (m) {
-			memmove(++fidx, path[i].p_idx - m,
+			memmove(++fidx, path[i].p_idx,
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
@@ -1056,8 +1073,9 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				 unsigned int flags,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+		newext, &err, flags);
 	if (newblock == 0)
 		return err;
 
@@ -1140,8 +1159,9 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				    unsigned int flags,
+				    struct ext4_ext_path *path,
+				    struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
 	if (EXT_HAS_FREE_INDEX(curp)) {
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
-		err = ext4_ext_split(handle, inode, path, newext, i);
+		err = ext4_ext_split(handle, inode, flags, path, newext, i);
 		if (err)
 			goto out;
 
@@ -1174,7 +1194,8 @@ repeat:
 			err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags,
+					    path, newext);
 		if (err)
 			goto out;
 
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *ex)
 {
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 }
 
 /*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	struct ext4_extent_header *eh;
+	unsigned int depth;
+	int merge_done = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	if (ex > EXT_FIRST_EXTENT(eh))
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+	if (!merge_done)
+		ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+	return ret;
+}
+
+/*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
  *
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
+	int flags = 0;
 
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
 	depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-			struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+	struct ext4_ext_cache *ex){
 	struct ext4_ext_cache *cex;
+	struct ext4_sb_info *sbi;
 	int ret = 0;
 
 	/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
+	sbi = EXT4_SB(inode->i_sb);
 
 	/* has cache valid data? */
 	if (cex->ec_len == 0)
 		goto errout;
 
 	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
 		ret = 1;
 	}
 errout:
+	if (!ret)
+		sbi->extent_cache_misses++;
+	else
+		sbi->extent_cache_hits++;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
 }
 
 /*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * extent pointer.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+			struct ext4_extent *ex)
+{
+	struct ext4_ext_cache cex;
+	int ret = 0;
+
+	if (ext4_ext_check_cache(inode, block, &cex)) {
+		ex->ee_block = cpu_to_le32(cex.ec_block);
+		ext4_ext_store_pblock(ex, cex.ec_start);
+		ex->ee_len = cpu_to_le16(cex.ec_len);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+
+/*
  * ext4_ext_rm_idx:
  * removes index from the index block.
  * It's used in truncate case only, thus all requests are for
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		/* head removal */
+		ext4_lblk_t num;
+		ext4_fsblk_t start;
+
+		num = to - from;
+		start = ext4_ext_pblock(ex);
+
+		ext_debug("free first %u blocks starting %llu\n", num, start);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
+
 	} else {
 		printk(KERN_INFO "strange request: removal(2) "
 				"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode:  The files inode
+ * @path:   The path to the leaf
+ * @start:  The first block to remove
+ * @end:   The last block to remove
+ */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start)
+		struct ext4_ext_path *path, ext4_lblk_t start,
+		ext4_lblk_t end)
 {
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	struct ext4_map_blocks map;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
-		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;
 
 		ext_debug("  border %u:%u\n", a, b);
 
-		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-			block = 0;
-			num = 0;
-			BUG();
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end <= ex_ee_block) {
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (a != ex_ee_block &&
+			b != ex_ee_block + ex_ee_len - 1) {
+			/*
+			 * If this is a truncate, then this condition should
+			 * never happen because at least one of the end points
+			 * needs to be on the edge of the extent.
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				block = 0;
+				num = 0;
+				err = -EIO;
+				goto out;
+			}
+			/*
+			 * else this is a hole punch, so the extent needs to
+			 * be split since neither edge of the hole is on the
+			 * extent edge
+			 */
+			else{
+				map.m_pblk = ext4_ext_pblock(ex);
+				map.m_lblk = ex_ee_block;
+				map.m_len = b - ex_ee_block;
+
+				err = ext4_split_extent(handle,
+					inode, path, &map, 0,
+					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+					EXT4_GET_BLOCKS_PRE_IO);
+
+				if (err < 0)
+					goto out;
+
+				ex_ee_len = ext4_ext_get_actual_len(ex);
+
+				b = ex_ee_block+ex_ee_len - 1 < end ?
+					ex_ee_block+ex_ee_len - 1 : end;
+
+				/* Then remove tail of this extent */
+				block = ex_ee_block;
+				num = a - block;
+			}
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
 			block = ex_ee_block;
 			num = a - block;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
 			/* remove head of the extent */
-			block = a;
-			num = b - a;
-			/* there is no "make a hole" API yet */
-			BUG();
+			block = b;
+			num =  ex_ee_block + ex_ee_len - b;
+
+			/*
+			 * If this is a truncate, this condition
+			 * should never happen
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		} else {
 			/* remove whole extent: excellent! */
 			block = ex_ee_block;
 			num = 0;
-			BUG_ON(a != ex_ee_block);
-			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+			if (a != ex_ee_block) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
+
+			if (b != ex_ee_block + ex_ee_len - 1) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		}
 
 		/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (num == 0) {
 			/* this extent is removed; mark slot entirely unused */
 			ext4_ext_store_pblock(ex, 0);
-			le16_add_cpu(&eh->eh_entries, -1);
+		} else if (block != ex_ee_block) {
+			/*
+			 * If this was a head removal, then we need to update
+			 * the physical block since it is now at a different
+			 * location
+			 */
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
 		}
 
 		ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
+		/*
+		 * If the extent was completely released,
+		 * we need to remove it from the leaf
+		 */
+		if (num == 0) {
+			if (end != EXT_MAX_BLOCK) {
+				/*
+				 * For hole punching, we need to scoot all the
+				 * extents up when an extent is removed so that
+				 * we dont have blank extents in the middle
+				 */
+				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+					sizeof(struct ext4_extent));
+
+				/* Now get rid of the one at the end */
+				memset(EXT_LAST_EXTENT(eh), 0,
+					sizeof(struct ext4_extent));
+			}
+			le16_add_cpu(&eh->eh_entries, -1);
+		}
+
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
 		ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
 			/* this is leaf block */
-			err = ext4_ext_rm_leaf(handle, inode, path, start);
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					start, end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 	return ret;
 }
 
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
+					due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
+
+/*
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is splitted.
+ * @split_flags: indicates if the extent could be zeroout if split fails, and
+ *		 the states(init or uninit) of new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are deterimined by split_flag.
+ *
+ * There are two cases:
+ *  a> the extent are splitted into two extent.
+ *  b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+			     struct inode *inode,
+			     struct ext4_ext_path *path,
+			     ext4_lblk_t split,
+			     int split_flag,
+			     int flags)
+{
+	ext4_fsblk_t newblock;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex, newex, orig_ex;
+	struct ext4_extent *ex2 = NULL;
+	unsigned int ee_len, depth;
+	int err = 0;
+
+	ext_debug("ext4_split_extents_at: inode %lu, logical"
+		"block %llu\n", inode->i_ino, (unsigned long long)split);
+
+	ext4_ext_show_leaf(inode, path);
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	newblock = split - ee_block + ext4_ext_pblock(ex);
+
+	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+
+	if (split == ee_block) {
+		/*
+		 * case b: block @split is the block that the extent begins with
+		 * then we just change the state of the extent, and splitting
+		 * is not needed.
+		 */
+		if (split_flag & EXT4_EXT_MARK_UNINIT2)
+			ext4_ext_mark_uninitialized(ex);
+		else
+			ext4_ext_mark_initialized(ex);
+
+		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+			ext4_ext_try_to_merge(inode, path, ex);
+
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		goto out;
+	}
+
+	/* case a */
+	memcpy(&orig_ex, ex, sizeof(orig_ex));
+	ex->ee_len = cpu_to_le16(split - ee_block);
+	if (split_flag & EXT4_EXT_MARK_UNINIT1)
+		ext4_ext_mark_uninitialized(ex);
+
+	/*
+	 * path may lead to new leaf, not to original leaf any more
+	 * after ext4_ext_insert_extent() returns,
+	 */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	if (err)
+		goto fix_extent_len;
+
+	ex2 = &newex;
+	ex2->ee_block = cpu_to_le32(split);
+	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
+	ext4_ext_store_pblock(ex2, newblock);
+	if (split_flag & EXT4_EXT_MARK_UNINIT2)
+		ext4_ext_mark_uninitialized(ex2);
+
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		err = ext4_ext_zeroout(inode, &orig_ex);
+		if (err)
+			goto fix_extent_len;
+		/* update the extent length and mark as initialized */
+		ex->ee_len = cpu_to_le32(ee_len);
+		ext4_ext_try_to_merge(inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		goto out;
+	} else if (err)
+		goto fix_extent_len;
+
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+
+fix_extent_len:
+	ex->ee_len = orig_ex.ee_len;
+	ext4_ext_dirty(handle, inode, path + depth);
+	return err;
+}
+
+/*
+ * ext4_split_extents() splits an extent and mark extent which is covered
+ * by @map as split_flags indicates
+ *
+ * It may result in splitting the extent into multiple extents (upto three)
+ * There are three possibilities:
+ *   a> There is no split required
+ *   b> Splits in two extents: Split is happening at either end of the extent
+ *   c> Splits in three extents: Somone is splitting in middle of the extent
+ *
+ */
+static int ext4_split_extent(handle_t *handle,
+			      struct inode *inode,
+			      struct ext4_ext_path *path,
+			      struct ext4_map_blocks *map,
+			      int split_flag,
+			      int flags)
+{
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len, depth;
+	int err = 0;
+	int uninitialized;
+	int split_flag1, flags1;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	uninitialized = ext4_ext_is_uninitialized(ex);
+
+	if (map->m_lblk + map->m_len < ee_block + ee_len) {
+		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+			      EXT4_EXT_MAY_ZEROOUT : 0;
+		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+		if (uninitialized)
+			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
+				       EXT4_EXT_MARK_UNINIT2;
+		err = ext4_split_extent_at(handle, inode, path,
+				map->m_lblk + map->m_len, split_flag1, flags1);
+		if (err)
+			goto out;
+	}
+
+	ext4_ext_drop_refs(path);
+	path = ext4_ext_find_extent(inode, map->m_lblk, path);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	if (map->m_lblk >= ee_block) {
+		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+			      EXT4_EXT_MAY_ZEROOUT : 0;
+		if (uninitialized)
+			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+		if (split_flag & EXT4_EXT_MARK_UNINIT2)
+			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+		err = ext4_split_extent_at(handle, inode, path,
+				map->m_lblk, split_flag1, flags);
+		if (err)
+			goto out;
+	}
+
+	ext4_ext_show_leaf(inode, path);
+out:
+	return err ? err : map->m_len;
+}
+
 #define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
-	struct ext4_extent *ex, newex, orig_ex;
-	struct ext4_extent *ex1 = NULL;
-	struct ext4_extent *ex2 = NULL;
-	struct ext4_extent *ex3 = NULL;
-	struct ext4_extent_header *eh;
+	struct ext4_map_blocks split_map;
+	struct ext4_extent zero_ex;
+	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
-	ext4_fsblk_t newblock;
 	int err = 0;
-	int ret = 0;
-	int may_zeroout;
+	int split_flag = 0;
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
-	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
-	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
-	ex2 = ex;
-	orig_ex.ee_block = ex->ee_block;
-	orig_ex.ee_len   = cpu_to_le16(ee_len);
-	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
+	WARN_ON(map->m_lblk < ee_block);
 	/*
 	 * It is safe to convert extent to initialized via explicit
 	 * zeroout only if extent is fully insde i_size or new_size.
 	 */
-	may_zeroout = ee_block + ee_len <= eof_block;
+	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-	err = ext4_ext_get_access(handle, inode, path + depth);
-	if (err)
-		goto out;
 	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
-		err =  ext4_ext_zeroout(inode, &orig_ex);
+	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		err = ext4_ext_zeroout(inode, ex);
 		if (err)
-			goto fix_extent_len;
-		/* update the extent length and mark as initialized */
-		ex->ee_block = orig_ex.ee_block;
-		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-		ext4_ext_dirty(handle, inode, path + depth);
-		/* zeroed the full extent */
-		return allocated;
-	}
-
-	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
-	if (map->m_lblk > ee_block) {
-		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-		ext4_ext_mark_uninitialized(ex1);
-		ex2 = &newex;
-	}
-	/*
-	 * for sanity, update the length of the ex2 extent before
-	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
-	 * overlap of blocks.
-	 */
-	if (!ex1 && allocated > map->m_len)
-		ex2->ee_len = cpu_to_le16(map->m_len);
-	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > map->m_len) {
-		unsigned int newdepth;
-		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
-		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
-			/*
-			 * map->m_lblk == ee_block is handled by the zerouout
-			 * at the beginning.
-			 * Mark first half uninitialized.
-			 * Mark second half initialized and zero out the
-			 * initialized extent
-			 */
-			ex->ee_block = orig_ex.ee_block;
-			ex->ee_len   = cpu_to_le16(ee_len - allocated);
-			ext4_ext_mark_uninitialized(ex);
-			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-			ext4_ext_dirty(handle, inode, path + depth);
-
-			ex3 = &newex;
-			ex3->ee_block = cpu_to_le32(map->m_lblk);
-			ext4_ext_store_pblock(ex3, newblock);
-			ex3->ee_len = cpu_to_le16(allocated);
-			err = ext4_ext_insert_extent(handle, inode, path,
-							ex3, 0);
-			if (err == -ENOSPC) {
-				err =  ext4_ext_zeroout(inode, &orig_ex);
-				if (err)
-					goto fix_extent_len;
-				ex->ee_block = orig_ex.ee_block;
-				ex->ee_len   = orig_ex.ee_len;
-				ext4_ext_store_pblock(ex,
-					ext4_ext_pblock(&orig_ex));
-				ext4_ext_dirty(handle, inode, path + depth);
-				/* blocks available from map->m_lblk */
-				return allocated;
-
-			} else if (err)
-				goto fix_extent_len;
-
-			/*
-			 * We need to zero out the second half because
-			 * an fallocate request can update file size and
-			 * converting the second half to initialized extent
-			 * implies that we can leak some junk data to user
-			 * space.
-			 */
-			err =  ext4_ext_zeroout(inode, ex3);
-			if (err) {
-				/*
-				 * We should actually mark the
-				 * second half as uninit and return error
-				 * Insert would have changed the extent
-				 */
-				depth = ext_depth(inode);
-				ext4_ext_drop_refs(path);
-				path = ext4_ext_find_extent(inode, map->m_lblk,
-							    path);
-				if (IS_ERR(path)) {
-					err = PTR_ERR(path);
-					return err;
-				}
-				/* get the second half extent details */
-				ex = path[depth].p_ext;
-				err = ext4_ext_get_access(handle, inode,
-								path + depth);
-				if (err)
-					return err;
-				ext4_ext_mark_uninitialized(ex);
-				ext4_ext_dirty(handle, inode, path + depth);
-				return err;
-			}
-
-			/* zeroed the second half */
-			return allocated;
-		}
-		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
-		ext4_ext_store_pblock(ex3, newblock + map->m_len);
-		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
-		ext4_ext_mark_uninitialized(ex3);
-		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
-		if (err == -ENOSPC && may_zeroout) {
-			err =  ext4_ext_zeroout(inode, &orig_ex);
-			if (err)
-				goto fix_extent_len;
-			/* update the extent length and mark as initialized */
-			ex->ee_block = orig_ex.ee_block;
-			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-			ext4_ext_dirty(handle, inode, path + depth);
-			/* zeroed the full extent */
-			/* blocks available from map->m_lblk */
-			return allocated;
-
-		} else if (err)
-			goto fix_extent_len;
-		/*
-		 * The depth, and hence eh & ex might change
-		 * as part of the insert above.
-		 */
-		newdepth = ext_depth(inode);
-		/*
-		 * update the extent length after successful insert of the
-		 * split extent
-		 */
-		ee_len -= ext4_ext_get_actual_len(ex3);
-		orig_ex.ee_len = cpu_to_le16(ee_len);
-		may_zeroout = ee_block + ee_len <= eof_block;
-
-		depth = newdepth;
-		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, path);
-		if (IS_ERR(path)) {
-			err = PTR_ERR(path);
 			goto out;
-		}
-		eh = path[depth].p_hdr;
-		ex = path[depth].p_ext;
-		if (ex2 != &newex)
-			ex2 = ex;
 
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			goto out;
-
-		allocated = map->m_len;
-
-		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
-		 * to insert a extent in the middle zerout directly
-		 * otherwise give the extent a chance to merge to left
-		 */
-		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-			map->m_lblk != ee_block && may_zeroout) {
-			err =  ext4_ext_zeroout(inode, &orig_ex);
-			if (err)
-				goto fix_extent_len;
-			/* update the extent length and mark as initialized */
-			ex->ee_block = orig_ex.ee_block;
-			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-			ext4_ext_dirty(handle, inode, path + depth);
-			/* zero out the first half */
-			/* blocks available from map->m_lblk */
-			return allocated;
-		}
-	}
-	/*
-	 * If there was a change of depth as part of the
-	 * insertion of ex3 above, we need to update the length
-	 * of the ex1 extent again here
-	 */
-	if (ex1 && ex1 != ex) {
-		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-		ext4_ext_mark_uninitialized(ex1);
-		ex2 = &newex;
-	}
-	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
-	ex2->ee_block = cpu_to_le32(map->m_lblk);
-	ext4_ext_store_pblock(ex2, newblock);
-	ex2->ee_len = cpu_to_le16(allocated);
-	if (ex2 != ex)
-		goto insert;
-	/*
-	 * New (initialized) extent starts from the first block
-	 * in the current extent. i.e., ex2 == ex
-	 * We have to see if it can be merged with the extent
-	 * on the left.
-	 */
-	if (ex2 > EXT_FIRST_EXTENT(eh)) {
-		/*
-		 * To merge left, pass "ex2 - 1" to try_to_merge(),
-		 * since it merges towards right _only_.
-		 */
-		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
-		if (ret) {
-			err = ext4_ext_correct_indexes(handle, inode, path);
-			if (err)
-				goto out;
-			depth = ext_depth(inode);
-			ex2--;
-		}
+		ext4_ext_mark_initialized(ex);
+		ext4_ext_try_to_merge(inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		goto out;
 	}
+
 	/*
-	 * Try to Merge towards right. This might be required
-	 * only when the whole extent is being written to.
-	 * i.e. ex2 == ex and ex3 == NULL.
+	 * four cases:
+	 * 1. split the extent into three extents.
+	 * 2. split the extent into two extents, zeroout the first half.
+	 * 3. split the extent into two extents, zeroout the second half.
+	 * 4. split the extent into two extents with out zeroout.
 	 */
-	if (!ex3) {
-		ret = ext4_ext_try_to_merge(inode, path, ex2);
-		if (ret) {
-			err = ext4_ext_correct_indexes(handle, inode, path);
+	split_map.m_lblk = map->m_lblk;
+	split_map.m_len = map->m_len;
+
+	if (allocated > map->m_len) {
+		if (allocated <= EXT4_EXT_ZERO_LEN &&
+		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+			/* case 3 */
+			zero_ex.ee_block =
+					 cpu_to_le32(map->m_lblk);
+			zero_ex.ee_len = cpu_to_le16(allocated);
+			ext4_ext_store_pblock(&zero_ex,
+				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+			err = ext4_ext_zeroout(inode, &zero_ex);
 			if (err)
 				goto out;
+			split_map.m_lblk = map->m_lblk;
+			split_map.m_len = allocated;
+		} else if ((map->m_lblk - ee_block + map->m_len <
+			   EXT4_EXT_ZERO_LEN) &&
+			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+			/* case 2 */
+			if (map->m_lblk != ee_block) {
+				zero_ex.ee_block = ex->ee_block;
+				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+							ee_block);
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex));
+				err = ext4_ext_zeroout(inode, &zero_ex);
+				if (err)
+					goto out;
+			}
+
+			split_map.m_lblk = ee_block;
+			split_map.m_len = map->m_lblk - ee_block + map->m_len;
+			allocated = map->m_len;
 		}
 	}
-	/* Mark modified extent as dirty */
-	err = ext4_ext_dirty(handle, inode, path + depth);
-	goto out;
-insert:
-	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
-	if (err == -ENOSPC && may_zeroout) {
-		err =  ext4_ext_zeroout(inode, &orig_ex);
-		if (err)
-			goto fix_extent_len;
-		/* update the extent length and mark as initialized */
-		ex->ee_block = orig_ex.ee_block;
-		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-		ext4_ext_dirty(handle, inode, path + depth);
-		/* zero out the first half */
-		return allocated;
-	} else if (err)
-		goto fix_extent_len;
+
+	allocated = ext4_split_extent(handle, inode, path,
+				       &split_map, split_flag, 0);
+	if (allocated < 0)
+		err = allocated;
+
 out:
-	ext4_ext_show_leaf(inode, path);
 	return err ? err : allocated;
-
-fix_extent_len:
-	ex->ee_block = orig_ex.ee_block;
-	ex->ee_len   = orig_ex.ee_len;
-	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-	ext4_ext_mark_uninitialized(ex);
-	ext4_ext_dirty(handle, inode, path + depth);
-	return err;
 }
 
 /*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 					struct ext4_ext_path *path,
 					int flags)
 {
-	struct ext4_extent *ex, newex, orig_ex;
-	struct ext4_extent *ex1 = NULL;
-	struct ext4_extent *ex2 = NULL;
-	struct ext4_extent *ex3 = NULL;
-	ext4_lblk_t ee_block, eof_block;
-	unsigned int allocated, ee_len, depth;
-	ext4_fsblk_t newblock;
-	int err = 0;
-	int may_zeroout;
+	ext4_lblk_t eof_block;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len;
+	int split_flag = 0, depth;
 
 	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
 		eof_block = map->m_lblk + map->m_len;
-
-	depth = ext_depth(inode);
-	ex = path[depth].p_ext;
-	ee_block = le32_to_cpu(ex->ee_block);
-	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (map->m_lblk - ee_block);
-	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
-	ex2 = ex;
-	orig_ex.ee_block = ex->ee_block;
-	orig_ex.ee_len   = cpu_to_le16(ee_len);
-	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
-
 	/*
 	 * It is safe to convert extent to initialized via explicit
 	 * zeroout only if extent is fully insde i_size or new_size.
 	 */
-	may_zeroout = ee_block + ee_len <= eof_block;
-
-	/*
- 	 * If the uninitialized extent begins at the same logical
- 	 * block where the write begins, and the write completely
- 	 * covers the extent, then we don't need to split it.
- 	 */
-	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
-		return allocated;
-
-	err = ext4_ext_get_access(handle, inode, path + depth);
-	if (err)
-		goto out;
-	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
-	if (map->m_lblk > ee_block) {
-		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-		ext4_ext_mark_uninitialized(ex1);
-		ex2 = &newex;
-	}
-	/*
-	 * for sanity, update the length of the ex2 extent before
-	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
-	 * overlap of blocks.
-	 */
-	if (!ex1 && allocated > map->m_len)
-		ex2->ee_len = cpu_to_le16(map->m_len);
-	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > map->m_len) {
-		unsigned int newdepth;
-		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
-		ext4_ext_store_pblock(ex3, newblock + map->m_len);
-		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
-		ext4_ext_mark_uninitialized(ex3);
-		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
-		if (err == -ENOSPC && may_zeroout) {
-			err =  ext4_ext_zeroout(inode, &orig_ex);
-			if (err)
-				goto fix_extent_len;
-			/* update the extent length and mark as initialized */
-			ex->ee_block = orig_ex.ee_block;
-			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-			ext4_ext_dirty(handle, inode, path + depth);
-			/* zeroed the full extent */
-			/* blocks available from map->m_lblk */
-			return allocated;
-
-		} else if (err)
-			goto fix_extent_len;
-		/*
-		 * The depth, and hence eh & ex might change
-		 * as part of the insert above.
-		 */
-		newdepth = ext_depth(inode);
-		/*
-		 * update the extent length after successful insert of the
-		 * split extent
-		 */
-		ee_len -= ext4_ext_get_actual_len(ex3);
-		orig_ex.ee_len = cpu_to_le16(ee_len);
-		may_zeroout = ee_block + ee_len <= eof_block;
-
-		depth = newdepth;
-		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, path);
-		if (IS_ERR(path)) {
-			err = PTR_ERR(path);
-			goto out;
-		}
-		ex = path[depth].p_ext;
-		if (ex2 != &newex)
-			ex2 = ex;
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
 
-		err = ext4_ext_get_access(handle, inode, path + depth);
-		if (err)
-			goto out;
+	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+	split_flag |= EXT4_EXT_MARK_UNINIT2;
 
-		allocated = map->m_len;
-	}
-	/*
-	 * If there was a change of depth as part of the
-	 * insertion of ex3 above, we need to update the length
-	 * of the ex1 extent again here
-	 */
-	if (ex1 && ex1 != ex) {
-		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-		ext4_ext_mark_uninitialized(ex1);
-		ex2 = &newex;
-	}
-	/*
-	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
-	 * using direct I/O, uninitialised still.
-	 */
-	ex2->ee_block = cpu_to_le32(map->m_lblk);
-	ext4_ext_store_pblock(ex2, newblock);
-	ex2->ee_len = cpu_to_le16(allocated);
-	ext4_ext_mark_uninitialized(ex2);
-	if (ex2 != ex)
-		goto insert;
-	/* Mark modified extent as dirty */
-	err = ext4_ext_dirty(handle, inode, path + depth);
-	ext_debug("out here\n");
-	goto out;
-insert:
-	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-	if (err == -ENOSPC && may_zeroout) {
-		err =  ext4_ext_zeroout(inode, &orig_ex);
-		if (err)
-			goto fix_extent_len;
-		/* update the extent length and mark as initialized */
-		ex->ee_block = orig_ex.ee_block;
-		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-		ext4_ext_dirty(handle, inode, path + depth);
-		/* zero out the first half */
-		return allocated;
-	} else if (err)
-		goto fix_extent_len;
-out:
-	ext4_ext_show_leaf(inode, path);
-	return err ? err : allocated;
-
-fix_extent_len:
-	ex->ee_block = orig_ex.ee_block;
-	ex->ee_len   = orig_ex.ee_len;
-	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-	ext4_ext_mark_uninitialized(ex);
-	ext4_ext_dirty(handle, inode, path + depth);
-	return err;
+	flags |= EXT4_GET_BLOCKS_PRE_IO;
+	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 					      struct inode *inode,
 					      struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 	struct ext4_extent_header *eh;
 	int depth;
 	int err = 0;
-	int ret = 0;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 
+	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)le32_to_cpu(ex->ee_block),
+		ext4_ext_get_actual_len(ex));
+
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
 	/* first mark the extent as initialized */
 	ext4_ext_mark_initialized(ex);
 
-	/*
-	 * We have to see if it can be merged with the extent
-	 * on the left.
-	 */
-	if (ex > EXT_FIRST_EXTENT(eh)) {
-		/*
-		 * To merge left, pass "ex - 1" to try_to_merge(),
-		 * since it merges towards right _only_.
-		 */
-		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
-		if (ret) {
-			err = ext4_ext_correct_indexes(handle, inode, path);
-			if (err)
-				goto out;
-			depth = ext_depth(inode);
-			ex--;
-		}
-	}
-	/*
-	 * Try to Merge towards right.
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
 	 */
-	ret = ext4_ext_try_to_merge(inode, path, ex);
-	if (ret) {
-		err = ext4_ext_correct_indexes(handle, inode, path);
-		if (err)
-			goto out;
-		depth = ext_depth(inode);
-	}
+	ext4_ext_try_to_merge(inode, path, ex);
+
 	/* Mark modified extent as dirty */
 	err = ext4_ext_dirty(handle, inode, path + depth);
 out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock = 0;
 	int err = 0, depth, ret;
 	unsigned int allocated = 0;
+	unsigned int punched_out = 0;
+	unsigned int result = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+	struct ext4_map_blocks punch_map;
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  map->m_lblk, map->m_len, inode->i_ino);
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
-	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+	if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
+		((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				/*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			/* Do not put uninitialized extent in the cache */
-			if (!ext4_ext_is_uninitialized(ex)) {
-				ext4_ext_put_in_cache(inode, ee_block,
-							ee_len, ee_start);
-				goto out;
+			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
+				/*
+				 * Do not put uninitialized extent
+				 * in the cache
+				 */
+				if (!ext4_ext_is_uninitialized(ex)) {
+					ext4_ext_put_in_cache(inode, ee_block,
+						ee_len, ee_start);
+					goto out;
+				}
+				ret = ext4_ext_handle_uninitialized_extents(
+					handle, inode, map, path, flags,
+					allocated, newblock);
+				return ret;
 			}
-			ret = ext4_ext_handle_uninitialized_extents(handle,
-					inode, map, path, flags, allocated,
-					newblock);
-			return ret;
+
+			/*
+			 * Punch out the map length, but only to the
+			 * end of the extent
+			 */
+			punched_out = allocated < map->m_len ?
+				allocated : map->m_len;
+
+			/*
+			 * Sense extents need to be converted to
+			 * uninitialized, they must fit in an
+			 * uninitialized extent
+			 */
+			if (punched_out > EXT_UNINIT_MAX_LEN)
+				punched_out = EXT_UNINIT_MAX_LEN;
+
+			punch_map.m_lblk = map->m_lblk;
+			punch_map.m_pblk = newblock;
+			punch_map.m_len = punched_out;
+			punch_map.m_flags = 0;
+
+			/* Check to see if the extent needs to be split */
+			if (punch_map.m_len != ee_len ||
+				punch_map.m_lblk != ee_block) {
+
+				ret = ext4_split_extent(handle, inode,
+				path, &punch_map, 0,
+				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+				EXT4_GET_BLOCKS_PRE_IO);
+
+				if (ret < 0) {
+					err = ret;
+					goto out2;
+				}
+				/*
+				 * find extent for the block at
+				 * the start of the hole
+				 */
+				ext4_ext_drop_refs(path);
+				kfree(path);
+
+				path = ext4_ext_find_extent(inode,
+				map->m_lblk, NULL);
+				if (IS_ERR(path)) {
+					err = PTR_ERR(path);
+					path = NULL;
+					goto out2;
+				}
+
+				depth = ext_depth(inode);
+				ex = path[depth].p_ext;
+				ee_len = ext4_ext_get_actual_len(ex);
+				ee_block = le32_to_cpu(ex->ee_block);
+				ee_start = ext4_ext_pblock(ex);
+
+			}
+
+			ext4_ext_mark_uninitialized(ex);
+
+			err = ext4_ext_remove_space(inode, map->m_lblk,
+				map->m_lblk + punched_out);
+
+			goto out2;
 		}
 	}
 
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	else
 		/* disable in-core preallocation for non-regular files */
 		ar.flags = 0;
+	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 	if (!newblock)
 		goto out2;
@@ -3529,7 +3647,11 @@ out2:
 	}
 	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
 		newblock, map->m_len, err ? err : allocated);
-	return err ? err : allocated;
+
+	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
+			punched_out : allocated;
+
+	return err ? err : result;
 }
 
 void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
-	err = ext4_ext_remove_space(inode, last_block);
+	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
 
 	/* In a multi-transaction truncate, we only make the final
 	 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
-out_stop:
 	up_write(&EXT4_I(inode)->i_data_sem);
+
+out_stop:
 	/*
 	 * If this was a simple ftruncate() and the file will remain alive,
 	 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
-		return -EOPNOTSUPP;
-
 	/*
 	 * currently supporting (pre)allocate mode for extent-based
 	 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	/* Return error if mode is not supported */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return ext4_punch_hole(file, offset, len);
+
 	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	map.m_lblk = offset >> blkbits;
 	/*
@@ -3691,7 +3817,8 @@ retry:
 			break;
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
+				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+				      EXT4_GET_BLOCKS_NO_NORMALIZE);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 		pgoff_t		last_offset;
 		pgoff_t		offset;
 		pgoff_t		index;
+		pgoff_t		start_index = 0;
 		struct page	**pages = NULL;
 		struct buffer_head *bh = NULL;
 		struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
 				kfree(pages);
 				return EXT_CONTINUE;
 			}
+			index = 0;
 
+next_page:
 			/* Try to find the 1st mapped buffer. */
-			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
 				  blksize_bits;
-			if (!page_has_buffers(pages[0]))
+			if (!page_has_buffers(pages[index]))
 				goto out;
-			head = page_buffers(pages[0]);
+			head = page_buffers(pages[index]);
 			if (!head)
 				goto out;
 
+			index++;
 			bh = head;
 			do {
-				if (buffer_mapped(bh)) {
+				if (end >= newex->ec_block +
+					newex->ec_len)
+					/* The buffer is out of
+					 * the request range.
+					 */
+					goto out;
+
+				if (buffer_mapped(bh) &&
+				    end >= newex->ec_block) {
+					start_index = index - 1;
 					/* get the 1st mapped buffer. */
-					if (end > newex->ec_block +
-						newex->ec_len)
-						/* The buffer is out of
-						 * the request range.
-						 */
-						goto out;
 					goto found_mapped_buffer;
 				}
+
 				bh = bh->b_this_page;
 				end++;
 			} while (bh != head);
 
-			/* No mapped buffer found. */
-			goto out;
+			/* No mapped buffer in the range found in this page,
+			 * We need to look up next page.
+			 */
+			if (index >= ret) {
+				/* There is no page left, but we need to limit
+				 * newex->ec_len.
+				 */
+				newex->ec_len = end - newex->ec_block;
+				goto out;
+			}
+			goto next_page;
 		} else {
 			/*Find contiguous delayed buffers. */
 			if (ret > 0 && pages[0]->index == last_offset)
 				head = page_buffers(pages[0]);
 			bh = head;
+			index = 1;
+			start_index = 0;
 		}
 
 found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
 				end++;
 			} while (bh != head);
 
-			for (index = 1; index < ret; index++) {
+			for (; index < ret; index++) {
 				if (!page_has_buffers(pages[index])) {
 					bh = NULL;
 					break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
 					bh = NULL;
 					break;
 				}
+
 				if (pages[index]->index !=
-					pages[0]->index + index) {
+				    pages[start_index]->index + index
+				    - start_index) {
 					/* Blocks are not contiguous. */
 					bh = NULL;
 					break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	return (error < 0 ? error : 0);
 }
 
+/*
+ * ext4_ext_punch_hole
+ *
+ * Punches a hole of "length" bytes in a file starting
+ * at byte "offset"
+ *
+ * @inode:  The inode of the file to punch a hole in
+ * @offset: The starting byte offset of the hole
+ * @length: The length of the hole
+ *
+ * Returns the number of blocks removed or negative on err
+ */
+int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ext4_ext_cache cache_ex;
+	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+	struct address_space *mapping = inode->i_mapping;
+	struct ext4_map_blocks map;
+	handle_t *handle;
+	loff_t first_block_offset, last_block_offset, block_len;
+	loff_t first_page, last_page, first_page_offset, last_page_offset;
+	int ret, credits, blocks_released, err = 0;
+
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
+	last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
+
+	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+	first_page_offset = first_page << PAGE_CACHE_SHIFT;
+	last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		err = filemap_write_and_wait_range(mapping,
+			first_page_offset == 0 ? 0 : first_page_offset-1,
+			last_page_offset);
+
+			if (err)
+				return err;
+	}
+
+	/* Now release the pages */
+	if (last_page_offset > first_page_offset) {
+		truncate_inode_pages_range(mapping, first_page_offset,
+					   last_page_offset-1);
+	}
+
+	/* finish any pending end_io work */
+	ext4_flush_completed_IO(inode);
+
+	credits = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_orphan_add(handle, inode);
+	if (err)
+		goto out;
+
+	/*
+	 * Now we need to zero out the un block aligned data.
+	 * If the file is smaller than a block, just
+	 * zero out the middle
+	 */
+	if (first_block > last_block)
+		ext4_block_zero_page_range(handle, mapping, offset, length);
+	else {
+		/* zero out the head of the hole before the first block */
+		block_len  = first_block_offset - offset;
+		if (block_len > 0)
+			ext4_block_zero_page_range(handle, mapping,
+						   offset, block_len);
+
+		/* zero out the tail of the hole after the last block */
+		block_len = offset + length - last_block_offset;
+		if (block_len > 0) {
+			ext4_block_zero_page_range(handle, mapping,
+					last_block_offset, block_len);
+		}
+	}
+
+	/* If there are no blocks to remove, return now */
+	if (first_block >= last_block)
+		goto out;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_ext_invalidate_cache(inode);
+	ext4_discard_preallocations(inode);
+
+	/*
+	 * Loop over all the blocks and identify blocks
+	 * that need to be punched out
+	 */
+	iblock = first_block;
+	blocks_released = 0;
+	while (iblock < last_block) {
+		max_blocks = last_block - iblock;
+		num_blocks = 1;
+		memset(&map, 0, sizeof(map));
+		map.m_lblk = iblock;
+		map.m_len = max_blocks;
+		ret = ext4_ext_map_blocks(handle, inode, &map,
+			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+		if (ret > 0) {
+			blocks_released += ret;
+			num_blocks = ret;
+		} else if (ret == 0) {
+			/*
+			 * If map blocks could not find the block,
+			 * then it is in a hole.  If the hole was
+			 * not already cached, then map blocks should
+			 * put it in the cache.  So we can get the hole
+			 * out of the cache
+			 */
+			memset(&cache_ex, 0, sizeof(cache_ex));
+			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
+				!cache_ex.ec_start) {
+
+				/* The hole is cached */
+				num_blocks = cache_ex.ec_block +
+				cache_ex.ec_len - iblock;
+
+			} else {
+				/* The block could not be identified */
+				err = -EIO;
+				break;
+			}
+		} else {
+			/* Map blocks error */
+			err = ret;
+			break;
+		}
+
+		if (num_blocks == 0) {
+			/* This condition should never happen */
+			ext_debug("Block lookup failed");
+			err = -EIO;
+			break;
+		}
+
+		iblock += num_blocks;
+	}
+
+	if (blocks_released > 0) {
+		ext4_ext_invalidate_cache(inode);
+		ext4_discard_preallocations(inode);
+	}
+
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+	ext4_orphan_del(handle, inode);
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+	return err;
+}
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 	return error;
 }
-
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d54..2c09723 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
 };
 
 const struct inode_operations ext4_file_inode_operations = {
-	.truncate	= ext4_truncate,
 	.setattr	= ext4_setattr,
 	.getattr	= ext4_getattr,
 #ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cb..ce66d2f 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
 
 static void dump_completed_IO(struct inode * inode)
 {
-#ifdef	EXT4_DEBUG
+#ifdef	EXT4FS_DEBUG
 	struct list_head *cur, *before, *after;
 	ext4_io_end_t *io, *io0, *io1;
 	unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 	int ret;
 	tid_t commit_tid;
+	bool needs_barrier = false;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
 	}
 
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-	if (jbd2_log_start_commit(journal, commit_tid)) {
-		/*
-		 * When the journal is on a different device than the
-		 * fs data disk, we need to issue the barrier in
-		 * writeback mode.  (In ordered mode, the jbd2 layer
-		 * will take care of issuing the barrier.  In
-		 * data=journal, all of the data blocks are written to
-		 * the journal device.)
-		 */
-		if (ext4_should_writeback_data(inode) &&
-		    (journal->j_fs_dev != journal->j_dev) &&
-		    (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					NULL);
-		ret = jbd2_log_wait_commit(journal, commit_tid);
-	} else if (journal->j_flags & JBD2_BARRIER)
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+	jbd2_log_start_commit(journal, commit_tid);
+	ret = jbd2_log_wait_commit(journal, commit_tid);
+	if (needs_barrier)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
  out:
 	trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8..50d0e9c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	while (target > 0) {
 		count = target;
 		/* allocating blocks for indirect blocks and direct blocks */
-		current_block = ext4_new_meta_blocks(handle, inode,
-							goal, &count, err);
+		current_block = ext4_new_meta_blocks(handle, inode, goal,
+						     0, &count, err);
 		if (*err)
 			goto failed_out;
 
@@ -1930,7 +1930,7 @@ repeat:
 	 * We do still charge estimated metadata to the sb though;
 	 * we cannot afford to run out of free blocks.
 	 */
-	if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+	if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
 		dquot_release_reservation_block(inode, 1);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
 				continue;
 			}
 
-			if (PageWriteback(page))
-				wait_on_page_writeback(page);
-
+			wait_on_page_writeback(page);
 			BUG_ON(PageWriteback(page));
 
 			if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
 			loff_t end = offset + iov_length(iov, nr_segs);
 
 			if (end > isize)
-				vmtruncate(inode, isize);
+				ext4_truncate_failed_write(inode);
 		}
 	}
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
 int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained with in one block.  If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
+ */
+int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
+{
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned blocksize, length, pos;
+	unsigned blocksize, max, pos;
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
 		return -EINVAL;
 
 	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
+	max = blocksize - (offset & (blocksize - 1));
+
+	/*
+	 * correct length if it does not fall between
+	 * 'from' and the end of the block
+	 */
+	if (length > max || length < 0)
+		length = max;
+
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
 	if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 
 int ext4_can_truncate(struct inode *inode)
 {
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return 0;
 	if (S_ISREG(inode->i_mode))
 		return 1;
 	if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
 }
 
 /*
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
+ *
+ * @inode:  File inode
+ * @offset: The offset where the hole will begin
+ * @len:    The length of the hole
+ *
+ * Returns: 0 on sucess or negative on failure
+ */
+
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	if (!S_ISREG(inode->i_mode))
+		return -ENOTSUPP;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		/* TODO: Add support for non extent hole punching */
+		return -ENOTSUPP;
+	}
+
+	return ext4_ext_punch_hole(file, offset, length);
+}
+
+/*
  * ext4_truncate()
  *
  * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	/*
 	 * Figure out the offset within the block group inode table
 	 */
-	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
 	inode_offset = ((inode->i_ino - 1) %
 			EXT4_INODES_PER_GROUP(sb));
 	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE &&
-	    (attr->ia_size < inode->i_size ||
-	     (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
+	    (attr->ia_size < inode->i_size)) {
 		handle_t *handle;
 
 		handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 			}
 		}
-		/* ext4_truncate will clear the flag */
-		if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
-			ext4_truncate(inode);
 	}
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode))
-		rc = vmtruncate(inode, attr->ia_size);
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != i_size_read(inode)) {
+			truncate_setsize(inode, attr->ia_size);
+			ext4_truncate(inode);
+		} else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+			ext4_truncate(inode);
+	}
 
 	if (!rc) {
 		setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out_unlock;
 	}
 	ret = 0;
-	if (PageMappedToDisk(page))
-		goto out_unlock;
+
+	lock_page(page);
+	wait_on_page_writeback(page);
+	if (PageMappedToDisk(page)) {
+		up_read(&inode->i_alloc_sem);
+		return VM_FAULT_LOCKED;
+	}
 
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
 	else
 		len = PAGE_CACHE_SIZE;
 
-	lock_page(page);
 	/*
 	 * return if we have all the buffers mapped. This avoid
 	 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (page_has_buffers(page)) {
 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 					ext4_bh_unmapped)) {
-			unlock_page(page);
-			goto out_unlock;
+			up_read(&inode->i_alloc_sem);
+			return VM_FAULT_LOCKED;
 		}
 	}
 	unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret < 0)
 		goto out_unlock;
 	ret = 0;
+
+	/*
+	 * write_begin/end might have created a dirty page and someone
+	 * could wander in and start the IO.  Make sure that hasn't
+	 * happened.
+	 */
+	lock_page(page);
+	wait_on_page_writeback(page);
+	up_read(&inode->i_alloc_sem);
+	return VM_FAULT_LOCKED;
 out_unlock:
 	if (ret)
 		ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16ee..859f2ae 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	struct inode *inode;
 	char *data;
 	char *bitmap;
+	struct ext4_group_info *grinfo;
 
 	mb_debug(1, "init page %lu\n", page->index);
 
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (first_group + i >= ngroups)
 			break;
 
+		grinfo = ext4_get_group_info(sb, first_group + i);
+		/*
+		 * If page is uptodate then we came here after online resize
+		 * which added some new uninitialized group info structs, so
+		 * we must skip all initialized uptodate buddies on the page,
+		 * which may be currently in use by an allocating task.
+		 */
+		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+			bh[i] = NULL;
+			continue;
+		}
+
 		err = -EIO;
 		desc = ext4_get_group_desc(sb, first_group + i, NULL);
 		if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	}
 
 	/* wait for I/O completion */
-	for (i = 0; i < groups_per_page && bh[i]; i++)
-		wait_on_buffer(bh[i]);
+	for (i = 0; i < groups_per_page; i++)
+		if (bh[i])
+			wait_on_buffer(bh[i]);
 
 	err = -EIO;
-	for (i = 0; i < groups_per_page && bh[i]; i++)
-		if (!buffer_uptodate(bh[i]))
+	for (i = 0; i < groups_per_page; i++)
+		if (bh[i] && !buffer_uptodate(bh[i]))
 			goto out;
 
 	err = 0;
 	first_block = page->index * blocks_per_page;
-	/* init the page  */
-	memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 	for (i = 0; i < blocks_per_page; i++) {
 		int group;
-		struct ext4_group_info *grinfo;
 
 		group = (first_block + i) >> 1;
 		if (group >= ngroups)
 			break;
 
+		if (!bh[group - first_group])
+			/* skip initialized uptodate buddy */
+			continue;
+
 		/*
 		 * data carry information regarding this
 		 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			 * incore got set to the group block bitmap below
 			 */
 			ext4_lock_group(sb, group);
+			/* init the buddy */
+			memset(data, 0xff, blocksize);
 			ext4_mb_generate_buddy(sb, data, incore, group);
 			ext4_unlock_group(sb, group);
 			incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 out:
 	if (bh) {
-		for (i = 0; i < groups_per_page && bh[i]; i++)
+		for (i = 0; i < groups_per_page; i++)
 			brelse(bh[i]);
 		if (bh != &bhs)
 			kfree(bh);
@@ -957,22 +974,21 @@ out:
 }
 
 /*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
  */
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
-					ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+		ext4_group_t group, struct ext4_buddy *e4b)
 {
-	int i;
-	int block, pnum;
+	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+	int block, pnum, poff;
 	int blocks_per_page;
-	int groups_per_page;
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
+	struct page *page;
+
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_bitmap_page = NULL;
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	/*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
 	 */
 	block = group * 2;
 	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-
-	groups_per_page = blocks_per_page >> 1;
-	if (groups_per_page == 0)
-		groups_per_page = 1;
-	/* read all groups the page covers into the cache */
-	for (i = 0; i < groups_per_page; i++) {
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -EIO;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_bitmap_page = page;
+	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-		if ((first_group + i) >= ngroups)
-			break;
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		down_write_nested(&grp->alloc_sem, i);
+	if (blocks_per_page >= 2) {
+		/* buddy and bitmap are on the same page */
+		return 0;
 	}
-	return i;
+
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -EIO;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_buddy_page = page;
+	return 0;
 }
 
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-					 ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 {
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-	/* release locks on all the groups */
-	for (i = 0; i < locked_group; i++) {
-
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		up_write(&grp->alloc_sem);
+	if (e4b->bd_bitmap_page) {
+		unlock_page(e4b->bd_bitmap_page);
+		page_cache_release(e4b->bd_bitmap_page);
+	}
+	if (e4b->bd_buddy_page) {
+		unlock_page(e4b->bd_buddy_page);
+		page_cache_release(e4b->bd_buddy_page);
 	}
-
 }
 
 /*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
 
-	int ret = 0;
-	void *bitmap;
-	int blocks_per_page;
-	int block, pnum, poff;
-	int num_grp_locked = 0;
 	struct ext4_group_info *this_grp;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
-	struct page *page = NULL, *bitmap_page = NULL;
+	struct ext4_buddy e4b;
+	struct page *page;
+	int ret = 0;
 
 	mb_debug(1, "init group %u\n", group);
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
 	 * This ensures that we don't reinit the buddy cache
 	 * page which map to the group from which we are already
 	 * allocating. If we are looking at the buddy cache we would
 	 * have taken a reference using ext4_mb_load_buddy and that
-	 * would have taken the alloc_sem lock.
+	 * would have pinned buddy page to page cache.
 	 */
-	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
-	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
 		/*
 		 * somebody initialized the group
 		 * return without doing anything
 		 */
-		ret = 0;
 		goto err;
 	}
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, NULL);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
-	}
-	if (page == NULL || !PageUptodate(page)) {
+
+	page = e4b.bd_bitmap_page;
+	ret = ext4_mb_init_cache(page, NULL);
+	if (ret)
+		goto err;
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
 	mark_page_accessed(page);
-	bitmap_page = page;
-	bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-	/* init buddy cache */
-	block++;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page == bitmap_page) {
+	if (e4b.bd_buddy_page == NULL) {
 		/*
 		 * If both the bitmap and buddy are in
 		 * the same page we don't need to force
 		 * init the buddy
 		 */
-		unlock_page(page);
-	} else if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, bitmap);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
+		ret = 0;
+		goto err;
 	}
-	if (page == NULL || !PageUptodate(page)) {
+	/* init buddy cache */
+	page = e4b.bd_buddy_page;
+	ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+	if (ret)
+		goto err;
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
 	mark_page_accessed(page);
 err:
-	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-	if (bitmap_page)
-		page_cache_release(bitmap_page);
-	if (page)
-		page_cache_release(page);
+	ext4_mb_put_buddy_page_lock(&e4b);
 	return ret;
 }
 
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	e4b->bd_group = group;
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
-	e4b->alloc_semp = &grp->alloc_sem;
-
-	/* Take the read lock on the group alloc
-	 * sem. This would make sure a parallel
-	 * ext4_mb_init_group happening on other
-	 * groups mapped by the page is blocked
-	 * till we are done with allocation
-	 */
-repeat_load_buddy:
-	down_read(e4b->alloc_semp);
 
 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
-		/* we need to check for group need init flag
-		 * with alloc_semp held so that we can be sure
-		 * that new blocks didn't get added to the group
-		 * when we are loading the buddy cache
-		 */
-		up_read(e4b->alloc_semp);
 		/*
 		 * we need full data about the group
 		 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
 		ret = ext4_mb_init_group(sb, group);
 		if (ret)
 			return ret;
-		goto repeat_load_buddy;
 	}
 
 	/*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
 	return 0;
 
 err:
+	if (page)
+		page_cache_release(page);
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
-
-	/* Done with the buddy cache */
-	up_read(e4b->alloc_semp);
 	return ret;
 }
 
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 		page_cache_release(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
-	/* Done with the buddy cache */
-	if (e4b->alloc_semp)
-		up_read(e4b->alloc_semp);
 }
 
 
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	get_page(ac->ac_bitmap_page);
 	ac->ac_buddy_page = e4b->bd_buddy_page;
 	get_page(ac->ac_buddy_page);
-	/* on allocation we use ac to track the held semaphore */
-	ac->alloc_semp =  e4b->alloc_semp;
-	e4b->alloc_semp = NULL;
 	/* store last allocated for subsequent stream allocation */
 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	struct super_block *sb = journal->j_private;
 	struct ext4_buddy e4b;
 	struct ext4_group_info *db;
-	int err, ret, count = 0, count2 = 0;
+	int err, count = 0, count2 = 0;
 	struct ext4_free_data *entry;
 	struct list_head *l, *ltmp;
 
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
-		if (test_opt(sb, DISCARD)) {
-			ret = ext4_issue_discard(sb, entry->group,
-					entry->start_blk, entry->count);
-			if (unlikely(ret == -EOPNOTSUPP)) {
-				ext4_warning(sb, "discard not supported, "
-						 "disabling");
-				clear_opt(sb, DISCARD);
-			}
-		}
+		if (test_opt(sb, DISCARD))
+			ext4_issue_discard(sb, entry->group,
+					   entry->start_blk, entry->count);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 			spin_unlock(&pa->pa_lock);
 		}
 	}
-	if (ac->alloc_semp)
-		up_read(ac->alloc_semp);
 	if (pa) {
 		/*
 		 * We want to add the pa to the right bucket.
 		 * Remove it from the list and while adding
 		 * make sure the list to which we are adding
-		 * doesn't grow big.  We need to release
-		 * alloc_semp before calling ext4_mb_add_n_trim()
+		 * doesn't grow big.
 		 */
 		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
 			spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		 * there is enough free blocks to do block allocation
 		 * and verify allocation doesn't exceed the quota limits.
 		 */
-		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+		while (ar->len &&
+			ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+
 			/* let others to free the space */
 			yield();
 			ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			return 0;
 		}
 		reserv_blks = ar->len;
-		while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
-			ar->flags |= EXT4_MB_HINT_NOPREALLOC;
-			ar->len--;
+		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+			dquot_alloc_block_nofail(ar->inode, ar->len);
+		} else {
+			while (ar->len &&
+				dquot_alloc_block(ar->inode, ar->len)) {
+
+				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+				ar->len--;
+			}
 		}
 		inquota = ar->len;
 		if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
 }
 
 /**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle:			handle to this transaction
+ * @sb:				super block
+ * @block:			start physcial block to add to the block group
+ * @count:			number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+			 ext4_fsblk_t block, unsigned long count)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gd_bh;
+	ext4_group_t block_group;
+	ext4_grpblk_t bit;
+	unsigned int i;
+	struct ext4_group_desc *desc;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_buddy e4b;
+	int err = 0, ret, blk_free_count;
+	ext4_grpblk_t blocks_freed;
+	struct ext4_group_info *grp;
+
+	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+	grp = ext4_get_group_info(sb, block_group);
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+		goto error_return;
+
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+	if (!desc)
+		goto error_return;
+
+	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+	    in_range(block + count - 1, ext4_inode_table(sb, desc),
+		     sbi->s_itb_per_group)) {
+		ext4_error(sb, "Adding blocks in system zones - "
+			   "Block = %llu, count = %lu",
+			   block, count);
+		goto error_return;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	for (i = 0, blocks_freed = 0; i < count; i++) {
+		BUFFER_TRACE(bitmap_bh, "clear bit");
+		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+			ext4_error(sb, "bit already cleared for block %llu",
+				   (ext4_fsblk_t)(block + i));
+			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+		} else {
+			blocks_freed++;
+		}
+	}
+
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
+
+	/*
+	 * need to update group_info->bb_free and bitmap
+	 * with group lock held. generate_buddy look at
+	 * them with group lock_held
+	 */
+	ext4_lock_group(sb, block_group);
+	mb_clear_bits(bitmap_bh->b_data, bit, count);
+	mb_free_blocks(NULL, &e4b, bit, count);
+	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+	ext4_free_blks_set(sb, desc, blk_free_count);
+	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+	ext4_unlock_group(sb, block_group);
+	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+	}
+
+	ext4_mb_unload_buddy(&e4b);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+	if (!err)
+		err = ret;
+
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, err);
+	return;
+}
+
+/**
  * ext4_trim_extent -- function to TRIM one single free extent in the group
  * @sb:		super block for the file system
  * @start:	starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
  * one will allocate those blocks, mark it as used in buddy bitmap. This must
  * be called with under the group lock.
  */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
-		ext4_group_t group, struct ext4_buddy *e4b)
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+			     ext4_group_t group, struct ext4_buddy *e4b)
 {
 	struct ext4_free_extent ex;
-	int ret = 0;
 
 	assert_spin_locked(ext4_group_lock_ptr(sb, group));
 
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
 	 */
 	mb_mark_used(e4b, &ex);
 	ext4_unlock_group(sb, group);
-
-	ret = ext4_issue_discard(sb, group, start, count);
-
+	ext4_issue_discard(sb, group, start, count);
 	ext4_lock_group(sb, group);
 	mb_free_blocks(NULL, e4b, start, ex.fe_len);
-	return ret;
 }
 
 /**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
  * the group buddy bitmap. This is done until whole group is scanned.
  */
 static ext4_grpblk_t
-ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
-		ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		   ext4_grpblk_t start, ext4_grpblk_t max,
+		   ext4_grpblk_t minblocks)
 {
 	void *bitmap;
 	ext4_grpblk_t next, count = 0;
-	ext4_group_t group;
-	int ret = 0;
+	struct ext4_buddy e4b;
+	int ret;
 
-	BUG_ON(e4b == NULL);
+	ret = ext4_mb_load_buddy(sb, group, &e4b);
+	if (ret) {
+		ext4_error(sb, "Error in loading buddy "
+				"information for %u", group);
+		return ret;
+	}
+	bitmap = e4b.bd_bitmap;
 
-	bitmap = e4b->bd_bitmap;
-	group = e4b->bd_group;
-	start = (e4b->bd_info->bb_first_free > start) ?
-		e4b->bd_info->bb_first_free : start;
 	ext4_lock_group(sb, group);
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
 
 	while (start < max) {
 		start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
 		next = mb_find_next_bit(bitmap, max, start);
 
 		if ((next - start) >= minblocks) {
-			ret = ext4_trim_extent(sb, start,
-				next - start, group, e4b);
-			if (ret < 0)
-				break;
+			ext4_trim_extent(sb, start,
+					 next - start, group, &e4b);
 			count += next - start;
 		}
 		start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
 			ext4_lock_group(sb, group);
 		}
 
-		if ((e4b->bd_info->bb_free - count) < minblocks)
+		if ((e4b.bd_info->bb_free - count) < minblocks)
 			break;
 	}
 	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
 
 	ext4_debug("trimmed %d blocks in the group %d\n",
 		count, group);
 
-	if (ret < 0)
-		count = ret;
-
 	return count;
 }
 
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
  */
 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 {
-	struct ext4_buddy e4b;
+	struct ext4_group_info *grp;
 	ext4_group_t first_group, last_group;
 	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
 	ext4_grpblk_t cnt = 0, first_block, last_block;
-	uint64_t start, len, minlen, trimmed;
+	uint64_t start, len, minlen, trimmed = 0;
 	ext4_fsblk_t first_data_blk =
 			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 	int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	start = range->start >> sb->s_blocksize_bits;
 	len = range->len >> sb->s_blocksize_bits;
 	minlen = range->minlen >> sb->s_blocksize_bits;
-	trimmed = 0;
 
 	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
 		return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		return -EINVAL;
 
 	for (group = first_group; group <= last_group; group++) {
-		ret = ext4_mb_load_buddy(sb, group, &e4b);
-		if (ret) {
-			ext4_error(sb, "Error in loading buddy "
-					"information for %u", group);
-			break;
+		grp = ext4_get_group_info(sb, group);
+		/* We only do this if the grp has never been initialized */
+		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+			ret = ext4_mb_init_group(sb, group);
+			if (ret)
+				break;
 		}
 
 		/*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 			last_block = first_block + len;
 		len -= last_block - first_block;
 
-		if (e4b.bd_info->bb_free >= minlen) {
-			cnt = ext4_trim_all_free(sb, &e4b, first_block,
+		if (grp->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, group, first_block,
 						last_block, minlen);
 			if (cnt < 0) {
 				ret = cnt;
-				ext4_mb_unload_buddy(&e4b);
 				break;
 			}
 		}
-		ext4_mb_unload_buddy(&e4b);
 		trimmed += cnt;
 		first_block = 0;
 	}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7..20b5e7b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
 	__u8 ac_op;		/* operation, for history only */
 	struct page *ac_bitmap_page;
 	struct page *ac_buddy_page;
-	/*
-	 * pointer to the held semaphore upon successful
-	 * block allocation
-	 */
-	struct rw_semaphore *alloc_semp;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
 };
@@ -215,7 +210,6 @@ struct ext4_buddy {
 	struct super_block *bd_sb;
 	__u16 bd_blkbits;
 	ext4_group_t bd_group;
-	struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4..b57b98f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
 	 */
-	ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
+	ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
 	memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
 
 	/*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 0000000..9bdef3f
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE_SYNC, bh);
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  ext4_fsblk_t mmp_block)
+{
+	struct mmp_struct *mmp;
+
+	if (*bh)
+		clear_buffer_uptodate(*bh);
+
+	/* This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator. */
+	if (!*bh)
+		*bh = sb_getblk(sb, mmp_block);
+	if (*bh) {
+		get_bh(*bh);
+		lock_buffer(*bh);
+		(*bh)->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC, *bh);
+		wait_on_buffer(*bh);
+		if (!buffer_uptodate(*bh)) {
+			brelse(*bh);
+			*bh = NULL;
+		}
+	}
+	if (!*bh) {
+		ext4_warning(sb, "Error while reading MMP block %llu",
+			     mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)((*bh)->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %s, last update device: %s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	ext4_fsblk_t mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval && (failed_writes % 60) == 0) {
+			ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		 /*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 new_seq;
+
+	do {
+		get_random_bytes(&new_seq, sizeof(u32));
+	} while (new_seq > EXT4_MMP_SEQ_MAX);
+
+	return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+				    ext4_fsblk_t mmp_block)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *bh = NULL;
+	struct mmp_struct *mmp = NULL;
+	struct mmpd_data *mmpd_data;
+	u32 seq;
+	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned int wait_time = 0;
+	int retval;
+
+	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+	    mmp_block >= ext4_blocks_count(es)) {
+		ext4_warning(sb, "Invalid MMP block in superblock");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+
+	mmp = (struct mmp_struct *)(bh->b_data);
+
+	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * update_interval from the superblock.
+	 */
+	if (mmp->mmp_check_interval > mmp_check_interval)
+		mmp_check_interval = mmp->mmp_check_interval;
+
+	seq = le32_to_cpu(mmp->mmp_seq);
+	if (seq == EXT4_MMP_SEQ_CLEAN)
+		goto skip;
+
+	if (seq == EXT4_MMP_SEQ_FSCK) {
+		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		goto failed;
+	}
+
+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+		ext4_warning(sb, "MMP interval %u higher than expected, please"
+			     " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+skip:
+	/*
+	 * write a new random sequence number.
+	 */
+	mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+
+	retval = write_mmp_block(bh);
+	if (retval)
+		goto failed;
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	if (!mmpd_data) {
+		ext4_warning(sb, "not enough memory for mmpd_data");
+		goto failed;
+	}
+	mmpd_data->sb = sb;
+	mmpd_data->bh = bh;
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+					     bdevname(bh->b_bdev,
+						      mmp->mmp_bdevname));
+	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+		EXT4_SB(sb)->s_mmp_tsk = NULL;
+		kfree(mmpd_data);
+		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+			     sb->s_id);
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	brelse(bh);
+	return 1;
+}
+
+
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e78..2b8304b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	 * It needs to call wait_on_page_writeback() to wait for the
 	 * writeback of the page.
 	 */
-	if (PageWriteback(page))
-		wait_on_page_writeback(page);
+	wait_on_page_writeback(page);
 
 	/* Release old bh and drop refs */
 	try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b0..b754b77 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	frame->at = entries;
 	frame->bh = bh;
 	bh = bh2;
+
+	ext4_handle_dirty_metadata(handle, dir, frame->bh);
+	ext4_handle_dirty_metadata(handle, dir, bh);
+
 	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-	dx_release (frames);
-	if (!(de))
+	if (!de) {
+		/*
+		 * Even if the block split failed, we have to properly write
+		 * out all the changes we did so far. Otherwise we can end up
+		 * with corrupted filesystem.
+		 */
+		ext4_mark_inode_dirty(handle, dir);
+		dx_release(frames);
 		return retval;
+	}
+	dx_release(frames);
 
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
 	handle_t *handle;
 	struct inode *inode;
 	int l, err, retries = 0;
+	int credits;
 
 	l = strlen(symname)+1;
 	if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
 
 	dquot_initialize(dir);
 
+	if (l > EXT4_N_BLOCKS * 4) {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks.
+		 */
+		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	} else {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	}
 retry:
-	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	handle = ext4_journal_start(dir, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2263,21 +2292,44 @@ retry:
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof(EXT4_I(inode)->i_data)) {
+	if (l > EXT4_N_BLOCKS * 4) {
 		inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
-		 * page_symlink() calls into ext4_prepare/commit_write.
-		 * We have a transaction open.  All is sweetness.  It also sets
-		 * i_size in generic_commit_write().
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext4_write_begin() which can wait
+		 * for transaction commit if we are running out of space
+		 * and thus we deadlock. So we have to stop transaction now
+		 * and restart it when symlink contents is written.
+		 * 
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
 		 */
+		drop_nlink(inode);
+		err = ext4_orphan_add(handle, inode);
+		ext4_journal_stop(handle);
+		if (err)
+			goto err_drop_inode;
 		err = __page_symlink(inode, symname, l, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext4_journal_start(dir,
+				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err_drop_inode;
+		}
+		inc_nlink(inode);
+		err = ext4_orphan_del(handle, inode);
 		if (err) {
+			ext4_journal_stop(handle);
 			clear_nlink(inode);
-			unlock_new_inode(inode);
-			ext4_mark_inode_dirty(handle, inode);
-			iput(inode);
-			goto out_stop;
+			goto err_drop_inode;
 		}
 	} else {
 		/* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
+err_drop_inode:
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
 }
 
 static int ext4_link(struct dentry *old_dentry,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd05..7bb8f76 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
 	for (i = 0; i < io_end->num_io_pages; i++) {
 		struct page *page = io_end->pages[i]->p_page;
 		struct buffer_head *bh, *head;
-		int partial_write = 0;
+		loff_t offset;
+		loff_t io_end_offset;
 
-		head = page_buffers(page);
-		if (error)
+		if (error) {
 			SetPageError(page);
-		BUG_ON(!head);
-		if (head->b_size != PAGE_CACHE_SIZE) {
-			loff_t offset;
-			loff_t io_end_offset = io_end->offset + io_end->size;
+			set_bit(AS_EIO, &page->mapping->flags);
+			head = page_buffers(page);
+			BUG_ON(!head);
+
+			io_end_offset = io_end->offset + io_end->size;
 
 			offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
 			bh = head;
 			do {
 				if ((offset >= io_end->offset) &&
-				    (offset+bh->b_size <= io_end_offset)) {
-					if (error)
-						buffer_io_error(bh);
-
-				}
-				if (buffer_delay(bh))
-					partial_write = 1;
-				else if (!buffer_mapped(bh))
-					clear_buffer_dirty(bh);
-				else if (buffer_dirty(bh))
-					partial_write = 1;
+				    (offset+bh->b_size <= io_end_offset))
+					buffer_io_error(bh);
+
 				offset += bh->b_size;
 				bh = bh->b_this_page;
 			} while (bh != head);
 		}
 
-		/*
-		 * If this is a partial write which happened to make
-		 * all buffers uptodate then we can optimize away a
-		 * bogus readpage() for the next read(). Here we
-		 * 'discover' whether the page went uptodate as a
-		 * result of this (potentially partial) write.
-		 */
-		if (!partial_write)
-			SetPageUptodate(page);
-
 		put_io_page(io_end->pages[i]);
 	}
 	io_end->num_io_pages = 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb..cc5c157 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,6 +38,7 @@
 #include <linux/ctype.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
+#include <linux/cleancache.h>
 #include <asm/uaccess.h>
 
 #include <linux/kthread.h>
@@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
 static int ext4_feature_set_ok(struct super_block *sb, int readonly);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
 
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext2",
+	.mount		= ext4_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
@@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 	sb->s_fs_info = NULL;
 	/*
 	 * Now that we are completely done shutting down the
@@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 
 	if (!test_opt(sb, INIT_INODE_TABLE))
 		seq_puts(seq, ",noinit_inode_table");
-	else if (sbi->s_li_wait_mult)
+	else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
 		seq_printf(seq, ",init_inode_table=%u",
 			   (unsigned) sbi->s_li_wait_mult);
 
@@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static const struct dquot_operations ext4_quota_operations = {
-#ifdef CONFIG_QUOTA
 	.get_reserved_space = ext4_get_reserved_space,
-#endif
 	.write_dquot	= ext4_write_dquot,
 	.acquire_dquot	= ext4_acquire_dquot,
 	.release_dquot	= ext4_release_dquot,
@@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		ext4_msg(sb, KERN_WARNING,
 			 "warning: mounting fs with errors, "
 			 "running e2fsck is recommended");
-	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
 		ext4_msg(sb, KERN_WARNING,
@@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt, sbi->s_mount_opt2);
 
+	cleancache_init_fs(sb);
 	return res;
 }
 
@@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
+static ssize_t extent_cache_hits_show(struct ext4_attr *a,
+				      struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
+}
+
+static ssize_t extent_cache_misses_show(struct ext4_attr *a,
+					struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
+}
+
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 					  struct ext4_sb_info *sbi,
 					  const char *buf, size_t count)
@@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(extent_cache_hits);
+EXT4_RO_ATTR(extent_cache_misses);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(extent_cache_hits),
+	ATTR_LIST(extent_cache_misses),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
@@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
 	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
 
-static void ext4_lazyinode_timeout(unsigned long data)
-{
-	struct task_struct *p = (struct task_struct *)data;
-	wake_up_process(p);
-}
-
 /* Find next suitable group and run ext4_init_inode_table */
 static int ext4_run_li_request(struct ext4_li_request *elr)
 {
@@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 		ret = ext4_init_inode_table(sb, group,
 					    elr->lr_timeout ? 0 : 1);
 		if (elr->lr_timeout == 0) {
-			timeout = jiffies - timeout;
-			if (elr->lr_sbi->s_li_wait_mult)
-				timeout *= elr->lr_sbi->s_li_wait_mult;
-			else
-				timeout *= 20;
+			timeout = (jiffies - timeout) *
+				  elr->lr_sbi->s_li_wait_mult;
 			elr->lr_timeout = timeout;
 		}
 		elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 
 /*
  * Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
  */
 static void ext4_remove_li_request(struct ext4_li_request *elr)
 {
@@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
 
 static void ext4_unregister_li_request(struct super_block *sb)
 {
-	struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
-	if (!ext4_li_info)
+	mutex_lock(&ext4_li_mtx);
+	if (!ext4_li_info) {
+		mutex_unlock(&ext4_li_mtx);
 		return;
+	}
 
 	mutex_lock(&ext4_li_info->li_list_mtx);
-	ext4_remove_li_request(elr);
+	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
 	mutex_unlock(&ext4_li_info->li_list_mtx);
+	mutex_unlock(&ext4_li_mtx);
 }
 
 static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
 	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
 	struct list_head *pos, *n;
 	struct ext4_li_request *elr;
-	unsigned long next_wakeup;
-	DEFINE_WAIT(wait);
+	unsigned long next_wakeup, cur;
 
 	BUG_ON(NULL == eli);
 
-	eli->li_timer.data = (unsigned long)current;
-	eli->li_timer.function = ext4_lazyinode_timeout;
-
-	eli->li_task = current;
-	wake_up(&eli->li_wait_task);
-
 cont_thread:
 	while (true) {
 		next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2817,15 @@ cont_thread:
 		if (freezing(current))
 			refrigerator();
 
-		if ((time_after_eq(jiffies, next_wakeup)) ||
+		cur = jiffies;
+		if ((time_after_eq(cur, next_wakeup)) ||
 		    (MAX_JIFFY_OFFSET == next_wakeup)) {
 			cond_resched();
 			continue;
 		}
 
-		eli->li_timer.expires = next_wakeup;
-		add_timer(&eli->li_timer);
-		prepare_to_wait(&eli->li_wait_daemon, &wait,
-				TASK_INTERRUPTIBLE);
-		if (time_before(jiffies, next_wakeup))
-			schedule();
-		finish_wait(&eli->li_wait_daemon, &wait);
+		schedule_timeout_interruptible(next_wakeup - cur);
+
 		if (kthread_should_stop()) {
 			ext4_clear_request_list();
 			goto exit_thread;
@@ -2833,12 +2849,7 @@ exit_thread:
 		goto cont_thread;
 	}
 	mutex_unlock(&eli->li_list_mtx);
-	del_timer_sync(&ext4_li_info->li_timer);
-	eli->li_task = NULL;
-	wake_up(&eli->li_wait_task);
-
 	kfree(ext4_li_info);
-	ext4_lazyinit_task = NULL;
 	ext4_li_info = NULL;
 	mutex_unlock(&ext4_li_mtx);
 
@@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
 	if (IS_ERR(ext4_lazyinit_task)) {
 		int err = PTR_ERR(ext4_lazyinit_task);
 		ext4_clear_request_list();
-		del_timer_sync(&ext4_li_info->li_timer);
 		kfree(ext4_li_info);
 		ext4_li_info = NULL;
 		printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
 		return err;
 	}
 	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
-	wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
 	return 0;
 }
 
@@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void)
 	if (!eli)
 		return -ENOMEM;
 
-	eli->li_task = NULL;
 	INIT_LIST_HEAD(&eli->li_request_list);
 	mutex_init(&eli->li_list_mtx);
 
-	init_waitqueue_head(&eli->li_wait_daemon);
-	init_waitqueue_head(&eli->li_wait_task);
-	init_timer(&eli->li_timer);
 	eli->li_state |= EXT4_LAZYINIT_QUIT;
 
 	ext4_li_info = eli;
@@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	int ret = 0;
 
-	if (sbi->s_li_request != NULL)
+	if (sbi->s_li_request != NULL) {
+		/*
+		 * Reset timeout so it can be computed again, because
+		 * s_li_wait_mult might have changed.
+		 */
+		sbi->s_li_request->lr_timeout = 0;
 		return 0;
+	}
 
 	if (first_not_zeroed == ngroups ||
 	    (sb->s_flags & MS_RDONLY) ||
-	    !test_opt(sb, INIT_INODE_TABLE)) {
-		sbi->s_li_request = NULL;
+	    !test_opt(sb, INIT_INODE_TABLE))
 		return 0;
-	}
-
-	if (first_not_zeroed == ngroups) {
-		sbi->s_li_request = NULL;
-		return 0;
-	}
 
 	elr = ext4_li_request_new(sb, first_not_zeroed);
 	if (!elr)
@@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
 		set_opt(sb, DELALLOC);
 
+	/*
+	 * set default s_li_wait_mult for lazyinit, for the case there is
+	 * no mount option specified.
+	 */
+	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
 	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
 			   &journal_devnum, &journal_ioprio, NULL, 0)) {
 		ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		       "feature flags set on rev 0 fs, "
 		       "running e2fsck is recommended");
 
+	if (IS_EXT2_SB(sb)) {
+		if (ext2_feature_set_ok(sb))
+			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+				 "using the ext4 subsystem");
+		else {
+			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+				 "to feature incompatibilities");
+			goto failed_mount;
+		}
+	}
+
+	if (IS_EXT3_SB(sb)) {
+		if (ext3_feature_set_ok(sb))
+			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+				 "using the ext4 subsystem");
+		else {
+			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+				 "to feature incompatibilities");
+			goto failed_mount;
+		}
+	}
+
 	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			  EXT4_HAS_INCOMPAT_FEATURE(sb,
 				    EXT4_FEATURE_INCOMPAT_RECOVER));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+	    !(sb->s_flags & MS_RDONLY))
+		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+			goto failed_mount3;
+
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
@@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount_wq;
 	} else {
 		clear_opt(sb, DATA_FLAGS);
-		set_opt(sb, WRITEBACK_DATA);
 		sbi->s_journal = NULL;
 		needs_recovery = 0;
 		goto no_journal;
@@ -3707,6 +3742,8 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+	if (sbi->s_mmp_tsk)
+		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	int err;
+	int err = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
@@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+						     EXT4_FEATURE_INCOMPAT_MMP))
+				if (ext4_multi_mount_protect(sb,
+						le64_to_cpu(es->s_mmp_block))) {
+					err = -EROFS;
+					goto restore_opts;
+				}
 			enable_quota = 1;
 		}
 	}
@@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	u64 fsid;
+	s64 bfree;
 
 	if (test_opt(sb, MINIX_DF)) {
 		sbi->s_overhead_last = 0;
@@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+	bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
 		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+	/* prevent underflow in case that few free space is available */
+	buf->f_bfree = max_t(s64, bfree, 0);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
 		buf->f_bavail = 0;
@@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
 	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);
 
+	if (!inode)
+		goto out;
+
 	/* Update modification times of quota files when userspace can
 	 * start looking at them */
 	handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 }
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext2",
-	.mount		= ext4_mount,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
 static inline void register_as_ext2(void)
 {
 	int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void)
 {
 	unregister_filesystem(&ext2_fs_type);
 }
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+		return 0;
+	if (sb->s_flags & MS_RDONLY)
+		return 1;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+		return 0;
+	return 1;
+}
 MODULE_ALIAS("ext2");
 #else
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void)
 {
 	unregister_filesystem(&ext3_fs_type);
 }
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+		return 0;
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+		return 0;
+	if (sb->s_flags & MS_RDONLY)
+		return 1;
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+		return 0;
+	return 1;
+}
 MODULE_ALIAS("ext3");
 #else
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
 static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void)
 	err = init_inodecache();
 	if (err)
 		goto out1;
-	register_as_ext2();
 	register_as_ext3();
+	register_as_ext2();
 	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1..c757adc 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
 			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
-			block = ext4_new_meta_blocks(handle, inode,
-						  goal, NULL, &error);
+			block = ext4_new_meta_blocks(handle, inode, goal, 0,
+						     NULL, &error);
 			if (error)
 				goto cleanup;
 
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3b222da..be15437 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fat_slot_info sinfo;
 	int err;
 
+	dentry_unhash(dentry);
+
 	lock_super(sb);
 	/*
 	 * Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 	old_inode = old_dentry->d_inode;
 	new_inode = new_dentry->d_inode;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	err = fat_scan(old_dir, old_name, &old_sinfo);
 	if (err) {
 		err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 20b4ea5..c61a678 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fat_slot_info sinfo;
 	int err;
 
+	dentry_unhash(dentry);
+
 	lock_super(sb);
 
 	err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int err, is_dir, update_dotdot, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	new_inode = new_dentry->d_inode;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29..0d0e3fa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	dentry_unhash(entry);
+
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
 	req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	struct fuse_rename_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(olddir);
 	struct fuse_req *req = fuse_get_req(fc);
+
+	if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
+		dentry_unhash(newent);
+
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b1..1cb70cd 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int res;
 
+	if (S_ISDIR(inode->i_mode))
+		dentry_unhash(dentry);
+
 	if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
 		return -ENOTEMPTY;
 	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
+		if (S_ISDIR(new_dentry->d_inode->i_mode))
+			dentry_unhash(new_dentry);
+
 		res = hfs_remove(new_dir, new_dentry);
 		if (res)
 			return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059..b288350 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int res;
 
+	dentry_unhash(dentry);
+
 	if (inode->i_size != 2)
 		return -ENOTEMPTY;
 
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		if (S_ISDIR(new_dentry->d_inode->i_mode))
+		if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+			dentry_unhash(new_dentry);
 			res = hfsplus_rmdir(new_dir, new_dentry);
-		else
+		} else {
 			res = hfsplus_unlink(new_dir, new_dentry);
+		}
 		if (res)
 			return res;
 	}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834e..e6816b9 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
 	char *file;
 	int err;
 
+	dentry_unhash(dentry);
+
 	if ((file = dentry_name(dentry)) == NULL)
 		return -ENOMEM;
 	err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	char *from_name, *to_name;
 	int err;
 
+	if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
+		dentry_unhash(to);
+
 	if ((from_name = dentry_name(from)) == NULL)
 		return -ENOMEM;
 	if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839..ff0ce21 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
 
 		dentry_unhash(dentry);
 		if (!d_unhashed(dentry)) {
-			dput(dentry);
 			hpfs_unlock(dir->i_sb);
 			return -ENOSPC;
 		}
@@ -403,7 +402,6 @@ again:
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
 			d_rehash(dentry);
-			dput(dentry);
 		} else {
 			struct iattr newattrs;
 			/*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
 			newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 			err = notify_change(dentry, &newattrs);
 			put_write_access(inode);
-			dput(dentry);
 			if (!err)
 				goto again;
 		}
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int r;
 
+	dentry_unhash(dentry);
+
 	hpfs_adjust_length(name, &len);
 	hpfs_lock(dir->i_sb);
 	err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh;
 	struct fnode *fnode;
 	int err;
+
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
 	err = 0;
 	hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e7a0357..7aafeb8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
 	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+struct file *hugetlb_file_setup(const char *name, size_t size,
+				vm_flags_t acctflag,
 				struct user_struct **user, int creat_flags)
 {
 	int error = -ENOMEM;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29148a8..7f21cf3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
 			ret = err;
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
-		commit_transaction->t_flushed_data_blocks = 1;
 		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		smp_mb__after_clear_bit();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
 		err = 0;
 	}
 
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	commit_transaction->t_state = T_COMMIT_DFLUSH;
+	write_unlock(&journal->j_state_lock);
 	/* 
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
 	 * the commit record
 	 */
-	if (commit_transaction->t_flushed_data_blocks &&
+	if (commit_transaction->t_need_data_flush &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
 		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
                    required. */
 		JBUFFER_TRACE(jh, "file as BJ_Forget");
 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/* Wake up any transactions which were waiting for this
-		   IO to complete */
+		/*
+		 * Wake up any transactions which were waiting for this IO to
+		 * complete. The barrier must be here so that changes by
+		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
+		 * does the waitqueue check.
+		 */
+		smp_mb();
 		wake_up_bit(&bh->b_state, BH_Unshadow);
 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 		__brelse(bh);
@@ -794,6 +802,10 @@ wait_for_iobuf:
 		jbd2_journal_abort(journal, err);
 
 	jbd_debug(3, "JBD: commit phase 5\n");
+	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+	commit_transaction->t_state = T_COMMIT_JFLUSH;
+	write_unlock(&journal->j_state_lock);
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
 
 	jbd_debug(3, "JBD: commit phase 7\n");
 
-	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
 	commit_transaction->t_start = jiffies;
 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db..9a78269 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
 	/*
-	 * Are we already doing a recent enough commit?
+	 * The only transaction we can possibly wait upon is the
+	 * currently running transaction (if it exists).  Otherwise,
+	 * the target tid must be an old one.
 	 */
-	if (!tid_geq(journal->j_commit_request, target)) {
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == target) {
 		/*
 		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 			  journal->j_commit_sequence);
 		wake_up(&journal->j_wait_commit);
 		return 1;
-	}
+	} else if (!tid_geq(journal->j_commit_request, target))
+		/* This should never happen, but if it does, preserve
+		   the evidence before kjournald goes into a loop and
+		   increments j_commit_sequence beyond all recognition. */
+		WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+			  journal->j_commit_request,
+			  journal->j_commit_sequence,
+			  target, journal->j_running_transaction ? 
+			  journal->j_running_transaction->t_tid : 0);
 	return 0;
 }
 
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 }
 
 /*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+	int ret = 0;
+	transaction_t *commit_trans;
+
+	if (!(journal->j_flags & JBD2_BARRIER))
+		return 0;
+	read_lock(&journal->j_state_lock);
+	/* Transaction already committed? */
+	if (tid_geq(journal->j_commit_sequence, tid))
+		goto out;
+	commit_trans = journal->j_committing_transaction;
+	if (!commit_trans || commit_trans->t_tid != tid) {
+		ret = 1;
+		goto out;
+	}
+	/*
+	 * Transaction is being committed and we already proceeded to
+	 * submitting a flush to fs partition?
+	 */
+	if (journal->j_fs_dev != journal->j_dev) {
+		if (!commit_trans->t_need_data_flush ||
+		    commit_trans->t_state >= T_COMMIT_DFLUSH)
+			goto out;
+	} else {
+		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+			goto out;
+	}
+	ret = 1;
+out:
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
+/*
  * Wait for a specified commit to complete.
  * The caller may not hold the journal lock.
  */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a..3eec82d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  */
 
 /*
- * Update transiaction's maximum wait time, if debugging is enabled.
+ * Update transaction's maximum wait time, if debugging is enabled.
  *
  * In order for t_max_wait to be reliable, it must be protected by a
  * lock.  But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  * means that maximum wait time reported by the jbd2_run_stats
  * tracepoint will always be zero.
  */
-static inline void update_t_max_wait(transaction_t *transaction)
+static inline void update_t_max_wait(transaction_t *transaction,
+				     unsigned long ts)
 {
 #ifdef CONFIG_JBD2_DEBUG
-	unsigned long ts = jiffies;
-
 	if (jbd2_journal_enable_debug &&
 	    time_after(transaction->t_start, ts)) {
 		ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 	tid_t		tid;
 	int		needed, need_to_start;
 	int		nblocks = handle->h_buffer_credits;
+	unsigned long ts = jiffies;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
 	/* OK, account for the buffers that this operation expects to
 	 * use and add the handle to the running transaction. 
 	 */
-	update_t_max_wait(transaction);
+	update_t_max_wait(transaction, ts);
 	handle->h_transaction = transaction;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
  *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
  */
 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	 */
 	JBUFFER_TRACE(jh, "cancelling revoke");
 	jbd2_journal_cancel_revoke(handle, jh);
-	jbd2_journal_put_journal_head(jh);
 out:
+	jbd2_journal_put_journal_head(jh);
 	return err;
 }
 
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 	    jinode->i_next_transaction == transaction)
 		goto done;
 
+	/*
+	 * We only ever set this variable to 1 so the test is safe. Since
+	 * t_need_data_flush is likely to be set, we do the test to save some
+	 * cacheline bouncing
+	 */
+	if (!transaction->t_need_data_flush)
+		transaction->t_need_data_flush = 1;
 	/* On some different transaction's list - should be
 	 * the committing one */
 	if (jinode->i_transaction) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd..05f7332 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	int ret;
 	uint32_t now = get_seconds();
 
+	dentry_unhash(dentry);
+
 	for (fd = f->dents ; fd; fd = fd->next) {
 		if (fd->ino)
 			return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	uint8_t type;
 	uint32_t now;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	/* The VFS will check for us and prevent trying to rename a
 	 * file over a directory and vice versa, but if it's a directory,
 	 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b5..865df16 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 
 	jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	/* Init inode for quota operations. */
 	dquot_initialize(dip);
 	dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
 		 new_dentry->d_name.name);
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	dquot_initialize(old_dir);
 	dquot_initialize(new_dir);
 
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1..f34c9cd 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 
+	dentry_unhash(dentry);
+
 	if (!logfs_empty_dir(inode))
 		return -ENOTEMPTY;
 
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
 	loff_t pos;
 	int err;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	/* 1. locate source dd */
 	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
 	if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f..f60aed8 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
 	struct inode * inode = dentry->d_inode;
 	int err = -ENOTEMPTY;
 
+	dentry_unhash(dentry);
+
 	if (minix_empty_dir(inode)) {
 		err = minix_unlink(dir, dentry);
 		if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct minix_dir_entry * old_de;
 	int err = -ENOENT;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_de = minix_find_entry(old_dentry, &old_page);
 	if (!old_de)
 		goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0afc809..fdfae9f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/cleancache.h>
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 		SetPageMappedToDisk(page);
 	}
 
+	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+	    cleancache_get_page(page) == 0) {
+		SetPageUptodate(page);
+		goto confused;
+	}
+
 	/*
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
diff --git a/fs/namei.c b/fs/namei.c
index 6ff858c..2358b32 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt).  In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode.  Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
  */
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
-	struct fs_struct *fs = current->fs;
-	struct dentry *dentry = nd->path.dentry;
-	int want_root = 0;
-
-	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		want_root = 1;
-		spin_lock(&fs->lock);
-		if (nd->root.mnt != fs->root.mnt ||
-				nd->root.dentry != fs->root.dentry)
-			goto err_root;
-	}
-	spin_lock(&dentry->d_lock);
-	if (!__d_rcu_to_refcount(dentry, nd->seq))
-		goto err;
-	BUG_ON(nd->inode != dentry->d_inode);
-	spin_unlock(&dentry->d_lock);
-	if (want_root) {
-		path_get(&nd->root);
-		spin_unlock(&fs->lock);
-	}
-	mntget(nd->path.mnt);
-
-	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
-	nd->flags &= ~LOOKUP_RCU;
-	return 0;
-err:
-	spin_unlock(&dentry->d_lock);
-err_root:
-	if (want_root)
-		spin_unlock(&fs->lock);
-	return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
-	if (nd->flags & LOOKUP_RCU)
-		return nameidata_drop_rcu(nd);
-	return 0;
-}
 
 /**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
  * Returns: 0 on success, -ECHILD on failure
  *
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd or NULL.  Must be called from rcu-walk context.
  */
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 			goto err_root;
 	}
 	spin_lock(&parent->d_lock);
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (!__d_rcu_to_refcount(dentry, nd->seq))
-		goto err;
-	/*
-	 * If the sequence check on the child dentry passed, then the child has
-	 * not been removed from its parent. This means the parent dentry must
-	 * be valid and able to take a reference at this point.
-	 */
-	BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-	BUG_ON(!parent->d_count);
-	parent->d_count++;
-	spin_unlock(&dentry->d_lock);
+	if (!dentry) {
+		if (!__d_rcu_to_refcount(parent, nd->seq))
+			goto err_parent;
+		BUG_ON(nd->inode != parent->d_inode);
+	} else {
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!__d_rcu_to_refcount(dentry, nd->seq))
+			goto err_child;
+		/*
+		 * If the sequence check on the child dentry passed, then
+		 * the child has not been removed from its parent. This
+		 * means the parent dentry must be valid and able to take
+		 * a reference at this point.
+		 */
+		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+		BUG_ON(!parent->d_count);
+		parent->d_count++;
+		spin_unlock(&dentry->d_lock);
+	}
 	spin_unlock(&parent->d_lock);
 	if (want_root) {
 		path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 	br_read_unlock(vfsmount_lock);
 	nd->flags &= ~LOOKUP_RCU;
 	return 0;
-err:
+
+err_child:
 	spin_unlock(&dentry->d_lock);
+err_parent:
 	spin_unlock(&parent->d_lock);
 err_root:
 	if (want_root)
@@ -510,59 +468,6 @@ err_root:
 	return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
-	if (nd->flags & LOOKUP_RCU) {
-		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
-			nd->flags &= ~LOOKUP_RCU;
-			if (!(nd->flags & LOOKUP_ROOT))
-				nd->root.mnt = NULL;
-			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
-			return -ECHILD;
-		}
-	}
-	return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
-	struct dentry *dentry = nd->path.dentry;
-
-	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	nd->flags &= ~LOOKUP_RCU;
-	if (!(nd->flags & LOOKUP_ROOT))
-		nd->root.mnt = NULL;
-	spin_lock(&dentry->d_lock);
-	if (!__d_rcu_to_refcount(dentry, nd->seq))
-		goto err_unlock;
-	BUG_ON(nd->inode != dentry->d_inode);
-	spin_unlock(&dentry->d_lock);
-
-	mntget(nd->path.mnt);
-
-	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
-
-	return 0;
-
-err_unlock:
-	spin_unlock(&dentry->d_lock);
-	rcu_read_unlock();
-	br_read_unlock(vfsmount_lock);
-	return -ECHILD;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd:  pointer nameidata
  *
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it.  Return 0 on
+ * success, -error on failure.  In case of failure caller does not
+ * need to drop nd->path.
  */
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
 {
 	struct dentry *dentry = nd->path.dentry;
 	int status;
 
+	if (nd->flags & LOOKUP_RCU) {
+		nd->flags &= ~LOOKUP_RCU;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
+		spin_lock(&dentry->d_lock);
+		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+			return -ECHILD;
+		}
+		BUG_ON(nd->inode != dentry->d_inode);
+		spin_unlock(&dentry->d_lock);
+		mntget(nd->path.mnt);
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
 		return 0;
 
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
 	if (!status)
 		status = -ESTALE;
 
+	path_put(&nd->path);
 	return status;
 }
 
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 		if (likely(__follow_mount_rcu(nd, path, inode, false)))
 			return 0;
 unlazy:
-		if (dentry) {
-			if (nameidata_dentry_drop_rcu(nd, dentry))
-				return -ECHILD;
-		} else {
-			if (nameidata_drop_rcu(nd))
-				return -ECHILD;
-		}
+		if (unlazy_walk(nd, dentry))
+			return -ECHILD;
 	} else {
 		dentry = __d_lookup(parent, name);
 	}
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
 		int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
 		if (err != -ECHILD)
 			return err;
-		if (nameidata_drop_rcu(nd))
+		if (unlazy_walk(nd, NULL))
 			return -ECHILD;
 	}
 	return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 		return -ENOENT;
 	}
 	if (unlikely(inode->i_op->follow_link) && follow) {
-		if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-			return -ECHILD;
+		if (nd->flags & LOOKUP_RCU) {
+			if (unlikely(unlazy_walk(nd, path->dentry))) {
+				terminate_walk(nd);
+				return -ECHILD;
+			}
+		}
 		BUG_ON(inode != path->dentry->d_inode);
 		return 1;
 	}
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
 		}
 	}
 
-	if (nd->flags & LOOKUP_RCU) {
-		/* went all way through without dropping RCU */
-		BUG_ON(err);
-		if (nameidata_drop_rcu_last(nd))
-			err = -ECHILD;
-	}
-
-	if (!err) {
-		err = handle_reval_path(nd);
-		if (err)
-			path_put(&nd->path);
-	}
+	if (!err)
+		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
 		if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			return ERR_PTR(error);
 		/* fallthrough */
 	case LAST_ROOT:
-		if (nd->flags & LOOKUP_RCU) {
-			if (nameidata_drop_rcu_last(nd))
-				return ERR_PTR(-ECHILD);
-		}
-		error = handle_reval_path(nd);
+		error = complete_walk(nd);
 		if (error)
-			goto exit;
+			return ERR_PTR(error);
 		audit_inode(pathname, nd->path.dentry);
 		if (open_flag & O_CREAT) {
 			error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		}
 		goto ok;
 	case LAST_BIND:
-		/* can't be RCU mode here */
-		error = handle_reval_path(nd);
+		error = complete_walk(nd);
 		if (error)
-			goto exit;
+			return ERR_PTR(error);
 		audit_inode(pathname, dir);
 		goto ok;
 	}
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error) /* symlink */
 			return NULL;
 		/* sayonara */
-		if (nd->flags & LOOKUP_RCU) {
-			if (nameidata_drop_rcu_last(nd))
-				return ERR_PTR(-ECHILD);
-		}
+		error = complete_walk(nd);
+		if (error)
+			return ERR_PTR(-ECHILD);
 
 		error = -ENOTDIR;
 		if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	}
 
 	/* create side of things */
-
-	if (nd->flags & LOOKUP_RCU) {
-		if (nameidata_drop_rcu_last(nd))
-			return ERR_PTR(-ECHILD);
-	}
+	error = complete_walk(nd);
+	if (error)
+		return ERR_PTR(error);
 
 	audit_inode(pathname, dir);
 	error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 }
 
 /*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
  *
  * A low-level filesystem can, if it choses, legally
  * do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
  */
 void dentry_unhash(struct dentry *dentry)
 {
-	dget(dentry);
 	shrink_dcache_parent(dentry);
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_count == 2)
+	if (dentry->d_count == 1)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EPERM;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	dentry_unhash(dentry);
+
+	error = -EBUSY;
 	if (d_mountpoint(dentry))
-		error = -EBUSY;
-	else {
-		error = security_inode_rmdir(dir, dentry);
-		if (!error) {
-			error = dir->i_op->rmdir(dir, dentry);
-			if (!error) {
-				dentry->d_inode->i_flags |= S_DEAD;
-				dont_mount(dentry);
-			}
-		}
-	}
+		goto out;
+
+	error = security_inode_rmdir(dir, dentry);
+	if (error)
+		goto out;
+
+	error = dir->i_op->rmdir(dir, dentry);
+	if (error)
+		goto out;
+
+	dentry->d_inode->i_flags |= S_DEAD;
+	dont_mount(dentry);
+
+out:
 	mutex_unlock(&dentry->d_inode->i_mutex);
-	if (!error) {
+	if (!error)
 		d_delete(dentry);
-	}
-	dput(dentry);
-
 	return error;
 }
 
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
  *	   we'd better make sure that there's no link(2) for them.
- *	d) some filesystems don't support opened-but-unlinked directories,
- *	   either because of layout or because they are not ready to deal with
- *	   all cases correctly. The latter will be fixed (taking this sort of
- *	   stuff into VFS), but the former is not going away. Solution: the same
- *	   trick as in rmdir().
- *	e) conversion from fhandle to dentry may come in the wrong moment - when
+ *	d) conversion from fhandle to dentry may come in the wrong moment - when
  *	   we are removing the target. Solution: we will have to grab ->i_mutex
  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
  *	   ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry)
 {
 	int error = 0;
-	struct inode *target;
+	struct inode *target = new_dentry->d_inode;
 
 	/*
 	 * If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 	if (error)
 		return error;
 
-	target = new_dentry->d_inode;
 	if (target)
 		mutex_lock(&target->i_mutex);
-	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-		error = -EBUSY;
-	else {
-		if (target)
-			dentry_unhash(new_dentry);
-		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-	}
+
+	error = -EBUSY;
+	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+		goto out;
+
+	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (error)
+		goto out;
+
 	if (target) {
-		if (!error) {
-			target->i_flags |= S_DEAD;
-			dont_mount(new_dentry);
-		}
-		mutex_unlock(&target->i_mutex);
-		if (d_unhashed(new_dentry))
-			d_rehash(new_dentry);
-		dput(new_dentry);
+		target->i_flags |= S_DEAD;
+		dont_mount(new_dentry);
 	}
+out:
+	if (target)
+		mutex_unlock(&target->i_mutex);
 	if (!error)
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 			    struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct inode *target;
+	struct inode *target = new_dentry->d_inode;
 	int error;
 
 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 		return error;
 
 	dget(new_dentry);
-	target = new_dentry->d_inode;
 	if (target)
 		mutex_lock(&target->i_mutex);
+
+	error = -EBUSY;
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-		error = -EBUSY;
-	else
-		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (!error) {
-		if (target)
-			dont_mount(new_dentry);
-		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-			d_move(old_dentry, new_dentry);
-	}
+		goto out;
+
+	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (error)
+		goto out;
+
+	if (target)
+		dont_mount(new_dentry);
+	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+		d_move(old_dentry, new_dentry);
+out:
 	if (target)
 		mutex_unlock(&target->i_mutex);
 	dput(new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf5..fe59bd1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 
 static int flags_to_propagation_type(int flags)
 {
-	int type = flags & ~MS_REC;
+	int type = flags & ~(MS_REC | MS_SILENT);
 
 	/* Fail if any non-propagation flags are set */
 	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb..e3e646b 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
 
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b..1102a5f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct nilfs_transaction_info ti;
 	int err;
 
+	dentry_unhash(dentry);
+
 	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
 	if (err)
 		return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct nilfs_transaction_info ti;
 	int err;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
 	if (unlikely(err))
 		return err;
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313..f17e58b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
 	namei.o 		\
 	refcounttree.o		\
 	reservations.o		\
+	move_extents.o		\
 	resize.o		\
 	slot_map.o 		\
 	suballoc.o 		\
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7..ed553c6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -7184,3 +7185,168 @@ out_commit:
 out:
 	return ret;
 }
+
+static int ocfs2_trim_extent(struct super_block *sb,
+			     struct ocfs2_group_desc *gd,
+			     u32 start, u32 count)
+{
+	u64 discard, bcount;
+
+	bcount = ocfs2_clusters_to_blocks(sb, count);
+	discard = le64_to_cpu(gd->bg_blkno) +
+			ocfs2_clusters_to_blocks(sb, start);
+
+	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
+
+	return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+			    struct ocfs2_group_desc *gd,
+			    u32 start, u32 max, u32 minbits)
+{
+	int ret = 0, count = 0, next;
+	void *bitmap = gd->bg_bitmap;
+
+	if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+		return 0;
+
+	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+			       start, max, minbits);
+
+	while (start < max) {
+		start = ocfs2_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = ocfs2_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minbits) {
+			ret = ocfs2_trim_extent(sb, gd,
+						start, next - start);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+			break;
+	}
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 start, len, trimmed, first_group, last_group, group;
+	int ret, cnt;
+	u32 first_bit, last_bit, minlen;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_dinode *main_bm;
+	struct ocfs2_group_desc *gd = NULL;
+
+	start = range->start >> osb->s_clustersize_bits;
+	len = range->len >> osb->s_clustersize_bits;
+	minlen = range->minlen >> osb->s_clustersize_bits;
+	trimmed = 0;
+
+	if (!len) {
+		range->len = 0;
+		return 0;
+	}
+
+	if (minlen >= osb->bitmap_cpg)
+		return -EINVAL;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (start >= le32_to_cpu(main_bm->i_clusters)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (start + len > le32_to_cpu(main_bm->i_clusters))
+		len = le32_to_cpu(main_bm->i_clusters) - start;
+
+	trace_ocfs2_trim_fs(start, len, minlen);
+
+	/* Determine first and last group to examine based on start and len */
+	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+	if (first_group == osb->first_cluster_group_blkno)
+		first_bit = start;
+	else
+		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+	last_bit = osb->bitmap_cpg;
+
+	for (group = first_group; group <= last_group;) {
+		if (first_bit + len >= osb->bitmap_cpg)
+			last_bit = osb->bitmap_cpg;
+		else
+			last_bit = first_bit + len;
+
+		ret = ocfs2_read_group_descriptor(main_bm_inode,
+						  main_bm, group,
+						  &gd_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+		brelse(gd_bh);
+		gd_bh = NULL;
+		if (cnt < 0) {
+			ret = cnt;
+			mlog_errno(ret);
+			break;
+		}
+
+		trimmed += cnt;
+		len -= osb->bitmap_cpg - first_bit;
+		first_bit = 0;
+		if (group == osb->first_cluster_group_blkno)
+			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+		else
+			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+	}
+	range->len = trimmed * sb->s_blocksize;
+out_unlock:
+	ocfs2_inode_unlock(main_bm_inode, 0);
+	brelse(main_bm_bh);
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+out:
+	return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
 		    struct buffer_head **leaf_bh);
 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
 /*
  * Helper function to look at the # of clusters in an extent record.
  */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702da..a4b0773 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
 void o2cb_sys_shutdown(void)
 {
 	mlog_sys_shutdown();
-	sysfs_remove_link(NULL, "o2cb");
 	kset_unregister(o2cb_kset);
 }
 
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
 	if (!o2cb_kset)
 		return -ENOMEM;
 
-	/*
-	 * Create this symlink for backwards compatibility with old
-	 * versions of ocfs2-tools which look for things in /sys/o2cb.
-	 */
-	ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
-	if (ret)
-		goto error;
-
 	ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
 	if (ret)
 		goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7ba..d602abb 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -144,6 +144,7 @@ struct dlm_ctxt
 	wait_queue_head_t dlm_join_events;
 	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
 	return 1;
 }
 
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+	if (idx == DLM_GRANTED_LIST)
+		return "granted";
+	else if (idx == DLM_CONVERTING_LIST)
+		return "converting";
+	else if (idx == DLM_BLOCKED_LIST)
+		return "blocked";
+	else
+		return "unknown";
+}
+
 static inline struct list_head *
 dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
 {
@@ -448,6 +461,7 @@ enum {
 	DLM_FINALIZE_RECO_MSG		= 518,
 	DLM_QUERY_REGION		= 519,
 	DLM_QUERY_NODEINFO		= 520,
+	DLM_BEGIN_EXIT_DOMAIN_MSG	= 521,
 };
 
 struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be..56f82cb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
 				 buf + out, len - out);
 	out += snprintf(buf + out, len - out, "\n");
 
+	/* Exit Domain Map: xx xx xx */
+	out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+	out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
 	/* Live Map: xx xx xx */
 	out += snprintf(buf + out, len - out, "Live Map: ");
 	out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6..6ed6b95 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  * New in version 1.1:
  *	- Message DLM_QUERY_REGION added to support global heartbeat
  *	- Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ * 	- Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
  */
 static const struct dlm_protocol_version dlm_protocol = {
 	.pv_major = 1,
-	.pv_minor = 1,
+	.pv_minor = 2,
 };
 
 #define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
 			dropped = dlm_empty_lockres(dlm, res);
 
 			spin_lock(&res->spinlock);
-			__dlm_lockres_calc_usage(dlm, res);
-			iter = res->hash_node.next;
+			if (dropped)
+				__dlm_lockres_calc_usage(dlm, res);
+			else
+				iter = res->hash_node.next;
 			spin_unlock(&res->spinlock);
 
 			dlm_lockres_put(res);
 
-			if (dropped)
+			if (dropped) {
+				cond_resched_lock(&dlm->spinlock);
 				goto redo_bucket;
+			}
 		}
 		cond_resched_lock(&dlm->spinlock);
 		num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
 	return ret;
 }
 
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+					 void *data, void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned int node;
+	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	node = exit_msg->node_idx;
+	mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
+
+	spin_lock(&dlm->spinlock);
+	set_bit(node, dlm->exit_domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+
+	return 0;
+}
+
 static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
 {
 	/* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	spin_lock(&dlm->spinlock);
 	clear_bit(node, dlm->domain_map);
+	clear_bit(node, dlm->exit_domain_map);
 	__dlm_print_nodes(dlm);
 
 	/* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 	return 0;
 }
 
-static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
 				    unsigned int node)
 {
 	int status;
 	struct dlm_exit_domain leave_msg;
 
-	mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
-		  node, dlm->name, dlm->node_num);
+	mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+	     msg_type, node);
 
 	memset(&leave_msg, 0, sizeof(leave_msg));
 	leave_msg.node_idx = dlm->node_num;
 
-	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
-				    &leave_msg, sizeof(leave_msg), node,
-				    NULL);
+	status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+				    sizeof(leave_msg), node, NULL);
 	if (status < 0)
-		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
-		     "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
-	mlog(0, "status return %d from o2net_send_message\n", status);
+		mlog(ML_ERROR, "Error %d sending domain exit message %u "
+		     "to node %u on domain %s\n", status, msg_type, node,
+		     dlm->name);
 
 	return status;
 }
 
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+	int node = -1;
+
+	/* Support for begin exit domain was added in 1.2 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor < 2)
+		return;
+
+	/*
+	 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+	 * informational. Meaning if a node does not receive the message,
+	 * so be it.
+	 */
+	spin_lock(&dlm->spinlock);
+	while (1) {
+		node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
+		if (node >= O2NM_MAX_NODES)
+			break;
+		if (node == dlm->node_num)
+			continue;
+
+		spin_unlock(&dlm->spinlock);
+		dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
+		spin_lock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+}
 
 static void dlm_leave_domain(struct dlm_ctxt *dlm)
 {
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
 
 		clear_node = 1;
 
-		status = dlm_send_one_domain_exit(dlm, node);
+		status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+						  node);
 		if (status < 0 &&
 		    status != -ENOPROTOOPT &&
 		    status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 
 	if (leave) {
 		mlog(0, "shutting down domain %s\n", dlm->name);
+		dlm_begin_exit_domain(dlm);
 
 		/* We changed dlm state, notify the thread */
 		dlm_kick_thread(dlm, NULL);
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 		 * leftover join state. */
 		BUG_ON(dlm->joining_node != assert->node_idx);
 		set_bit(assert->node_idx, dlm->domain_map);
+		clear_bit(assert->node_idx, dlm->exit_domain_map);
 		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
 
 		printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
 	if (status)
 		goto bail;
 
+	status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+					sizeof(struct dlm_exit_domain),
+					dlm_begin_exit_domain_handler,
+					dlm, NULL, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
 bail:
 	if (status)
 		dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d1663..11eefb8 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 	dlm_lockres_put(res);
 }
 
-/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
- * if not. If 0, numlocks is set to the number of locks in the lockres.
+/*
+ * A migrateable resource is one that is :
+ * 1. locally mastered, and,
+ * 2. zero local locks, and,
+ * 3. one or more non-local locks, or, one or more references
+ * Returns 1 if yes, 0 if not.
  */
 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
-				      struct dlm_lock_resource *res,
-				      int *numlocks,
-				      int *hasrefs)
+				      struct dlm_lock_resource *res)
 {
-	int ret;
-	int i;
-	int count = 0;
+	enum dlm_lockres_list idx;
+	int nonlocal = 0, node_ref;
 	struct list_head *queue;
 	struct dlm_lock *lock;
+	u64 cookie;
 
 	assert_spin_locked(&res->spinlock);
 
-	*numlocks = 0;
-	*hasrefs = 0;
-
-	ret = -EINVAL;
-	if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
-		mlog(0, "cannot migrate lockres with unknown owner!\n");
-		goto leave;
-	}
-
-	if (res->owner != dlm->node_num) {
-		mlog(0, "cannot migrate lockres this node doesn't own!\n");
-		goto leave;
-	}
+	if (res->owner != dlm->node_num)
+		return 0;
 
-	ret = 0;
-	queue = &res->granted;
-	for (i = 0; i < 3; i++) {
+        for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+		queue = dlm_list_idx_to_ptr(res, idx);
 		list_for_each_entry(lock, queue, list) {
-			++count;
-			if (lock->ml.node == dlm->node_num) {
-				mlog(0, "found a lock owned by this node still "
-				     "on the %s queue!  will not migrate this "
-				     "lockres\n", (i == 0 ? "granted" :
-						   (i == 1 ? "converting" :
-						    "blocked")));
-				ret = -ENOTEMPTY;
-				goto leave;
+			if (lock->ml.node != dlm->node_num) {
+				nonlocal++;
+				continue;
 			}
+			cookie = be64_to_cpu(lock->ml.cookie);
+			mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+			     "%s list\n", dlm->name, res->lockname.len,
+			     res->lockname.name,
+			     dlm_get_lock_cookie_node(cookie),
+			     dlm_get_lock_cookie_seq(cookie),
+			     dlm_list_in_text(idx));
+			return 0;
 		}
-		queue++;
 	}
 
-	*numlocks = count;
-
-	count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-	if (count < O2NM_MAX_NODES)
-		*hasrefs = 1;
+	if (!nonlocal) {
+		node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+		if (node_ref >= O2NM_MAX_NODES)
+			return 0;
+	}
 
-	mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
-	     res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
+	mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+	     res->lockname.name);
 
-leave:
-	return ret;
+	return 1;
 }
 
 /*
@@ -2406,8 +2396,7 @@ leave:
 
 
 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
-			       struct dlm_lock_resource *res,
-			       u8 target)
+			       struct dlm_lock_resource *res, u8 target)
 {
 	struct dlm_master_list_entry *mle = NULL;
 	struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	const char *name;
 	unsigned int namelen;
 	int mle_added = 0;
-	int numlocks, hasrefs;
 	int wake = 0;
 
 	if (!dlm_grab(dlm))
 		return -EINVAL;
 
+	BUG_ON(target == O2NM_MAX_NODES);
+
 	name = res->lockname.name;
 	namelen = res->lockname.len;
 
-	mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
-
-	/*
-	 * ensure this lockres is a proper candidate for migration
-	 */
-	spin_lock(&res->spinlock);
-	ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-	if (ret < 0) {
-		spin_unlock(&res->spinlock);
-		goto leave;
-	}
-	spin_unlock(&res->spinlock);
-
-	/* no work to do */
-	if (numlocks == 0 && !hasrefs)
-		goto leave;
-
-	/*
-	 * preallocate up front
-	 * if this fails, abort
-	 */
+	mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+	     target);
 
+	/* preallocate up front. if this fails, abort */
 	ret = -ENOMEM;
 	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
 	if (!mres) {
@@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	ret = 0;
 
 	/*
-	 * find a node to migrate the lockres to
-	 */
-
-	spin_lock(&dlm->spinlock);
-	/* pick a new node */
-	if (!test_bit(target, dlm->domain_map) ||
-	    target >= O2NM_MAX_NODES) {
-		target = dlm_pick_migration_target(dlm, res);
-	}
-	mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
-	     namelen, name, target);
-
-	if (target >= O2NM_MAX_NODES ||
-	    !test_bit(target, dlm->domain_map)) {
-		/* target chosen is not alive */
-		ret = -EINVAL;
-	}
-
-	if (ret) {
-		spin_unlock(&dlm->spinlock);
-		goto fail;
-	}
-
-	mlog(0, "continuing with target = %u\n", target);
-
-	/*
 	 * clear any existing master requests and
 	 * add the migration mle to the list
 	 */
+	spin_lock(&dlm->spinlock);
 	spin_lock(&dlm->master_lock);
 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
 				    namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
 			dlm_put_mle(mle);
 		} else if (mle) {
 			kmem_cache_free(dlm_mle_cache, mle);
+			mle = NULL;
 		}
 		goto leave;
 	}
@@ -2652,69 +2600,52 @@ leave:
 	if (wake)
 		wake_up(&res->wq);
 
-	/* TODO: cleanup */
 	if (mres)
 		free_page((unsigned long)mres);
 
 	dlm_put(dlm);
 
-	mlog(0, "returning %d\n", ret);
+	mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+	     name, target, ret);
 	return ret;
 }
 
 #define DLM_MIGRATION_RETRY_MS  100
 
-/* Should be called only after beginning the domain leave process.
+/*
+ * Should be called only after beginning the domain leave process.
  * There should not be any remaining locks on nonlocal lock resources,
  * and there should be no local locks left on locally mastered resources.
  *
  * Called with the dlm spinlock held, may drop it to do migration, but
  * will re-acquire before exit.
  *
- * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
 	int ret;
 	int lock_dropped = 0;
-	int numlocks, hasrefs;
+	u8 target = O2NM_MAX_NODES;
+
+	assert_spin_locked(&dlm->spinlock);
 
 	spin_lock(&res->spinlock);
-	if (res->owner != dlm->node_num) {
-		if (!__dlm_lockres_unused(res)) {
-			mlog(ML_ERROR, "%s:%.*s: this node is not master, "
-			     "trying to free this but locks remain\n",
-			     dlm->name, res->lockname.len, res->lockname.name);
-		}
-		spin_unlock(&res->spinlock);
-		goto leave;
-	}
+	if (dlm_is_lockres_migrateable(dlm, res))
+		target = dlm_pick_migration_target(dlm, res);
+	spin_unlock(&res->spinlock);
 
-	/* No need to migrate a lockres having no locks */
-	ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-	if (ret >= 0 && numlocks == 0 && !hasrefs) {
-		spin_unlock(&res->spinlock);
+	if (target == O2NM_MAX_NODES)
 		goto leave;
-	}
-	spin_unlock(&res->spinlock);
 
 	/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
 	spin_unlock(&dlm->spinlock);
 	lock_dropped = 1;
-	while (1) {
-		ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
-		if (ret >= 0)
-			break;
-		if (ret == -ENOTEMPTY) {
-			mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
-		     		res->lockname.len, res->lockname.name);
-			BUG();
-		}
-
-		mlog(0, "lockres %.*s: migrate failed, "
-		     "retrying\n", res->lockname.len,
-		     res->lockname.name);
-		msleep(DLM_MIGRATION_RETRY_MS);
-	}
+	ret = dlm_migrate_lockres(dlm, res, target);
+	if (ret)
+		mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+		     dlm->name, res->lockname.len, res->lockname.name,
+		     target, ret);
 	spin_lock(&dlm->spinlock);
 leave:
 	return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 	}
 }
 
-/* for now this is not too intelligent.  we will
- * need stats to make this do the right thing.
- * this just finds the first lock on one of the
- * queues and uses that node as the target. */
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 				    struct dlm_lock_resource *res)
 {
-	int i;
+	enum dlm_lockres_list idx;
 	struct list_head *queue = &res->granted;
 	struct dlm_lock *lock;
-	int nodenum;
+	int noderef;
+	u8 nodenum = O2NM_MAX_NODES;
 
 	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
 
-	spin_lock(&res->spinlock);
-	for (i=0; i<3; i++) {
+	/* Go through all the locks */
+	for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+		queue = dlm_list_idx_to_ptr(res, idx);
 		list_for_each_entry(lock, queue, list) {
-			/* up to the caller to make sure this node
-			 * is alive */
-			if (lock->ml.node != dlm->node_num) {
-				spin_unlock(&res->spinlock);
-				return lock->ml.node;
-			}
+			if (lock->ml.node == dlm->node_num)
+				continue;
+			if (test_bit(lock->ml.node, dlm->exit_domain_map))
+				continue;
+			nodenum = lock->ml.node;
+			goto bail;
 		}
-		queue++;
-	}
-
-	nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-	if (nodenum < O2NM_MAX_NODES) {
-		spin_unlock(&res->spinlock);
-		return nodenum;
 	}
-	spin_unlock(&res->spinlock);
-	mlog(0, "have not found a suitable target yet! checking domain map\n");
 
-	/* ok now we're getting desperate.  pick anyone alive. */
-	nodenum = -1;
+	/* Go thru the refmap */
+	noderef = -1;
 	while (1) {
-		nodenum = find_next_bit(dlm->domain_map,
-					O2NM_MAX_NODES, nodenum+1);
-		mlog(0, "found %d in domain map\n", nodenum);
-		if (nodenum >= O2NM_MAX_NODES)
+		noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+					noderef + 1);
+		if (noderef >= O2NM_MAX_NODES)
 			break;
-		if (nodenum != dlm->node_num) {
-			mlog(0, "picking %d\n", nodenum);
-			return nodenum;
-		}
+		if (noderef == dlm->node_num)
+			continue;
+		if (test_bit(noderef, dlm->exit_domain_map))
+			continue;
+		nodenum = noderef;
+		goto bail;
 	}
 
-	mlog(0, "giving up.  no master to migrate to\n");
-	return DLM_LOCK_RES_OWNER_UNKNOWN;
+bail:
+	return nodenum;
 }
 
-
-
 /* this is called by the new master once all lockres
  * data has been received */
 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6f..7efab6d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 
 	mlog(0, "node %u being removed from domain map!\n", idx);
 	clear_bit(idx, dlm->domain_map);
+	clear_bit(idx, dlm->exit_domain_map);
 	/* wake up migration waiters if a node goes down.
 	 * perhaps later we can genericize this for other waiters. */
 	wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0ed..b420767 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
  *		  signifies a bast fired on the lock.
  */
 #define DLMFS_CAPABILITIES "bast stackglue"
-extern int param_set_dlmfs_capabilities(const char *val,
+static int param_set_dlmfs_capabilities(const char *val,
 					struct kernel_param *kp)
 {
 	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6..b1e35a3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
 	.flock		= ocfs2_flock,
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
+	.fallocate	= ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c59..bc91072 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
 #include "ioctl.h"
 #include "resize.h"
 #include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
 
 #include <linux/ext2_fs.h>
 
@@ -35,31 +40,27 @@
  * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
  * just a best-effort to tell userspace that this request caused the error.
  */
-static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
 					struct ocfs2_info_request __user *req)
 {
 	kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
 	(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
 }
 
-#define o2info_set_request_error(a, b) \
-		__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
-
-static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
 {
 	req->ir_flags |= OCFS2_INFO_FL_FILLED;
 }
 
-#define o2info_set_request_filled(a) \
-		__o2info_set_request_filled((struct ocfs2_info_request *)&(a))
-
-static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
 {
 	req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
 }
 
-#define o2info_clear_request_filled(a) \
-		__o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+	return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
 
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
 
 	oib.ib_blocksize = inode->i_sb->s_blocksize;
 
-	o2info_set_request_filled(oib);
+	o2info_set_request_filled(&oib.ib_req);
 
 	if (o2info_to_user(oib, req))
 		goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oib, req);
+		o2info_set_request_error(&oib.ib_req, req);
 
 	return status;
 }
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
 
 	oic.ic_clustersize = osb->s_clustersize;
 
-	o2info_set_request_filled(oic);
+	o2info_set_request_filled(&oic.ic_req);
 
 	if (o2info_to_user(oic, req))
 		goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oic, req);
+		o2info_set_request_error(&oic.ic_req, req);
 
 	return status;
 }
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
 
 	oim.im_max_slots = osb->max_slots;
 
-	o2info_set_request_filled(oim);
+	o2info_set_request_filled(&oim.im_req);
 
 	if (o2info_to_user(oim, req))
 		goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oim, req);
+		o2info_set_request_error(&oim.im_req, req);
 
 	return status;
 }
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
 
 	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
 
-	o2info_set_request_filled(oil);
+	o2info_set_request_filled(&oil.il_req);
 
 	if (o2info_to_user(oil, req))
 		goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oil, req);
+		o2info_set_request_error(&oil.il_req, req);
 
 	return status;
 }
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
 
 	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
 
-	o2info_set_request_filled(oiu);
+	o2info_set_request_filled(&oiu.iu_req);
 
 	if (o2info_to_user(oiu, req))
 		goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oiu, req);
+		o2info_set_request_error(&oiu.iu_req, req);
 
 	return status;
 }
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
 	oif.if_incompat_features = osb->s_feature_incompat;
 	oif.if_ro_compat_features = osb->s_feature_ro_compat;
 
-	o2info_set_request_filled(oif);
+	o2info_set_request_filled(&oif.if_req);
 
 	if (o2info_to_user(oif, req))
 		goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oif, req);
+		o2info_set_request_error(&oif.if_req, req);
 
 	return status;
 }
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
 
 	oij.ij_journal_size = osb->journal->j_inode->i_size;
 
-	o2info_set_request_filled(oij);
+	o2info_set_request_filled(&oij.ij_req);
 
 	if (o2info_to_user(oij, req))
 		goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oij, req);
+		o2info_set_request_error(&oij.ij_req, req);
+
+	return status;
+}
+
+int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+				struct inode *inode_alloc, u64 blkno,
+				struct ocfs2_info_freeinode *fi, u32 slot)
+{
+	int status = 0, unlock = 0;
+
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *dinode_alloc = NULL;
+
+	if (inode_alloc)
+		mutex_lock(&inode_alloc->i_mutex);
+
+	if (o2info_coherent(&fi->ifi_req)) {
+		status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		unlock = 1;
+	} else {
+		status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+	fi->ifi_stat[slot].lfi_total =
+		le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+	fi->ifi_stat[slot].lfi_free =
+		le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+	if (unlock)
+		ocfs2_inode_unlock(inode_alloc, 0);
+
+	if (inode_alloc)
+		mutex_unlock(&inode_alloc->i_mutex);
+
+	brelse(bh);
+
+	return status;
+}
+
+int ocfs2_info_handle_freeinode(struct inode *inode,
+				struct ocfs2_info_request __user *req)
+{
+	u32 i;
+	u64 blkno = -1;
+	char namebuf[40];
+	int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+	struct ocfs2_info_freeinode *oifi = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *inode_alloc = NULL;
+
+	oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+	if (!oifi) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (o2info_from_user(*oifi, req))
+		goto bail;
+
+	oifi->ifi_slotnum = osb->max_slots;
+
+	for (i = 0; i < oifi->ifi_slotnum; i++) {
+		if (o2info_coherent(&oifi->ifi_req)) {
+			inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+			if (!inode_alloc) {
+				mlog(ML_ERROR, "unable to get alloc inode in "
+				    "slot %u\n", i);
+				status = -EIO;
+				goto bail;
+			}
+		} else {
+			ocfs2_sprintf_system_inode_name(namebuf,
+							sizeof(namebuf),
+							type, i);
+			status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+							    namebuf,
+							    strlen(namebuf),
+							    &blkno);
+			if (status < 0) {
+				status = -ENOENT;
+				goto bail;
+			}
+		}
+
+		status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+		if (status < 0)
+			goto bail;
+
+		iput(inode_alloc);
+		inode_alloc = NULL;
+	}
+
+	o2info_set_request_filled(&oifi->ifi_req);
+
+	if (o2info_to_user(*oifi, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oifi->ifi_req, req);
+
+	kfree(oifi);
+
+	return status;
+}
+
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+				   unsigned int chunksize)
+{
+	int index;
+
+	index = __ilog2_u32(chunksize);
+	if (index >= OCFS2_INFO_MAX_HIST)
+		index = OCFS2_INFO_MAX_HIST - 1;
+
+	hist->fc_chunks[index]++;
+	hist->fc_clusters[index] += chunksize;
+}
+
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+			       unsigned int chunksize)
+{
+	if (chunksize > stats->ffs_max)
+		stats->ffs_max = chunksize;
+
+	if (chunksize < stats->ffs_min)
+		stats->ffs_min = chunksize;
+
+	stats->ffs_avg += chunksize;
+	stats->ffs_free_chunks_real++;
+}
+
+void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+			   unsigned int chunksize)
+{
+	o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
+	o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
+}
+
+int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+				   struct inode *gb_inode,
+				   struct ocfs2_dinode *gb_dinode,
+				   struct ocfs2_chain_rec *rec,
+				   struct ocfs2_info_freefrag *ffg,
+				   u32 chunks_in_group)
+{
+	int status = 0, used;
+	u64 blkno;
+
+	struct buffer_head *bh = NULL;
+	struct ocfs2_group_desc *bg = NULL;
+
+	unsigned int max_bits, num_clusters;
+	unsigned int offset = 0, cluster, chunk;
+	unsigned int chunk_free, last_chunksize = 0;
+
+	if (!le32_to_cpu(rec->c_free))
+		goto bail;
+
+	do {
+		if (!bg)
+			blkno = le64_to_cpu(rec->c_blkno);
+		else
+			blkno = le64_to_cpu(bg->bg_next_group);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		if (o2info_coherent(&ffg->iff_req))
+			status = ocfs2_read_group_descriptor(gb_inode,
+							     gb_dinode,
+							     blkno, &bh);
+		else
+			status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+		if (status < 0) {
+			mlog(ML_ERROR, "Can't read the group descriptor # "
+			     "%llu from device.", (unsigned long long)blkno);
+			status = -EIO;
+			goto bail;
+		}
+
+		bg = (struct ocfs2_group_desc *)bh->b_data;
+
+		if (!le16_to_cpu(bg->bg_free_bits_count))
+			continue;
+
+		max_bits = le16_to_cpu(bg->bg_bits);
+		offset = 0;
+
+		for (chunk = 0; chunk < chunks_in_group; chunk++) {
+			/*
+			 * last chunk may be not an entire one.
+			 */
+			if ((offset + ffg->iff_chunksize) > max_bits)
+				num_clusters = max_bits - offset;
+			else
+				num_clusters = ffg->iff_chunksize;
+
+			chunk_free = 0;
+			for (cluster = 0; cluster < num_clusters; cluster++) {
+				used = ocfs2_test_bit(offset,
+						(unsigned long *)bg->bg_bitmap);
+				/*
+				 * - chunk_free counts free clusters in #N chunk.
+				 * - last_chunksize records the size(in) clusters
+				 *   for the last real free chunk being counted.
+				 */
+				if (!used) {
+					last_chunksize++;
+					chunk_free++;
+				}
+
+				if (used && last_chunksize) {
+					ocfs2_info_update_ffg(ffg,
+							      last_chunksize);
+					last_chunksize = 0;
+				}
+
+				offset++;
+			}
+
+			if (chunk_free == ffg->iff_chunksize)
+				ffg->iff_ffs.ffs_free_chunks++;
+		}
+
+		/*
+		 * need to update the info for last free chunk.
+		 */
+		if (last_chunksize)
+			ocfs2_info_update_ffg(ffg, last_chunksize);
+
+	} while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+	brelse(bh);
+
+	return status;
+}
+
+int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+				    struct inode *gb_inode, u64 blkno,
+				    struct ocfs2_info_freefrag *ffg)
+{
+	u32 chunks_in_group;
+	int status = 0, unlock = 0, i;
+
+	struct buffer_head *bh = NULL;
+	struct ocfs2_chain_list *cl = NULL;
+	struct ocfs2_chain_rec *rec = NULL;
+	struct ocfs2_dinode *gb_dinode = NULL;
+
+	if (gb_inode)
+		mutex_lock(&gb_inode->i_mutex);
+
+	if (o2info_coherent(&ffg->iff_req)) {
+		status = ocfs2_inode_lock(gb_inode, &bh, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		unlock = 1;
+	} else {
+		status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+	cl = &(gb_dinode->id2.i_chain);
+
+	/*
+	 * Chunksize(in) clusters from userspace should be
+	 * less than clusters in a group.
+	 */
+	if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+	ffg->iff_ffs.ffs_min = ~0U;
+	ffg->iff_ffs.ffs_clusters =
+			le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+	ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+			le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+	chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+		rec = &(cl->cl_recs[i]);
+		status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+							gb_dinode,
+							rec, ffg,
+							chunks_in_group);
+		if (status)
+			goto bail;
+	}
+
+	if (ffg->iff_ffs.ffs_free_chunks_real)
+		ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+					ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+	if (unlock)
+		ocfs2_inode_unlock(gb_inode, 0);
+
+	if (gb_inode)
+		mutex_unlock(&gb_inode->i_mutex);
+
+	if (gb_inode)
+		iput(gb_inode);
+
+	brelse(bh);
+
+	return status;
+}
+
+int ocfs2_info_handle_freefrag(struct inode *inode,
+			       struct ocfs2_info_request __user *req)
+{
+	u64 blkno = -1;
+	char namebuf[40];
+	int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+	struct ocfs2_info_freefrag *oiff;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *gb_inode = NULL;
+
+	oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+	if (!oiff) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (o2info_from_user(*oiff, req))
+		goto bail;
+	/*
+	 * chunksize from userspace should be power of 2.
+	 */
+	if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+	    (!oiff->iff_chunksize)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (o2info_coherent(&oiff->iff_req)) {
+		gb_inode = ocfs2_get_system_file_inode(osb, type,
+						       OCFS2_INVALID_SLOT);
+		if (!gb_inode) {
+			mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+			status = -EIO;
+			goto bail;
+		}
+	} else {
+		ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+						OCFS2_INVALID_SLOT);
+		status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+						    namebuf,
+						    strlen(namebuf),
+						    &blkno);
+		if (status < 0) {
+			status = -ENOENT;
+			goto bail;
+		}
+	}
+
+	status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+	if (status < 0)
+		goto bail;
+
+	o2info_set_request_filled(&oiff->iff_req);
+
+	if (o2info_to_user(*oiff, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oiff->iff_req, req);
+
+	kfree(oiff);
 
 	return status;
 }
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
 	if (o2info_from_user(oir, req))
 		goto bail;
 
-	o2info_clear_request_filled(oir);
+	o2info_clear_request_filled(&oir);
 
 	if (o2info_to_user(oir, req))
 		goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
 	status = 0;
 bail:
 	if (status)
-		o2info_set_request_error(oir, req);
+		o2info_set_request_error(&oir, req);
 
 	return status;
 }
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
 		if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
 			status = ocfs2_info_handle_journal_size(inode, req);
 		break;
+	case OCFS2_INFO_FREEINODE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+			status = ocfs2_info_handle_freeinode(inode, req);
+		break;
+	case OCFS2_INFO_FREEFRAG:
+		if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+			status = ocfs2_info_handle_freefrag(inode, req);
+		break;
 	default:
 		status = ocfs2_info_handle_unknown(inode, req);
 		break;
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			return -EFAULT;
 
 		return ocfs2_info_handle(inode, &info, 0);
+	case FITRIM:
+	{
+		struct super_block *sb = inode->i_sb;
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&range, (struct fstrim_range *)arg,
+		    sizeof(range)))
+			return -EFAULT;
+
+		ret = ocfs2_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range *)arg, &range,
+		    sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+	case OCFS2_IOC_MOVE_EXT:
+		return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
 	default:
 		return -ENOTTY;
 	}
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+	case FITRIM:
 		break;
 	case OCFS2_IOC_REFLINK:
 		if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			return -EFAULT;
 
 		return ocfs2_info_handle(inode, &info, 1);
+	case OCFS2_IOC_MOVE_EXT:
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 0000000..4c54884
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "suballoc.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+struct ocfs2_move_extents_context {
+	struct inode *inode;
+	struct file *file;
+	int auto_defrag;
+	int partial;
+	int credits;
+	u32 new_phys_cpos;
+	u32 clusters_moved;
+	u64 refcount_loc;
+	struct ocfs2_move_extents *range;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+static int __ocfs2_move_extent(handle_t *handle,
+			       struct ocfs2_move_extents_context *context,
+			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+			       int ext_flags)
+{
+	int ret = 0, index;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_rec *rec, replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+	ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
+					       p_cpos, new_p_cpos, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+								   new_p_cpos));
+
+	path = ocfs2_new_path_from_et(&context->et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	rec = &el->l_recs[index];
+
+	BUG_ON(ext_flags != rec->e_flags);
+	/*
+	 * after moving/defraging to new location, the extent is not going
+	 * to be refcounted anymore.
+	 */
+	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+				      context->et.et_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, &context->et, path, index,
+				 &replace_rec, context->meta_ac,
+				 &context->dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+	context->new_phys_cpos = new_p_cpos;
+
+	/*
+	 * need I to append truncate log for old clusters?
+	 */
+	if (old_blkno) {
+		if (ext_flags & OCFS2_EXT_REFCOUNTED)
+			ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(osb->sb,
+								 old_blkno),
+					len, context->meta_ac,
+					&context->dealloc, 1);
+		else
+			ret = ocfs2_truncate_log_append(osb, handle,
+							old_blkno, len);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * lock allocators, and reserving appropriate number of bits for
+ * meta blocks and data clusters.
+ *
+ * in some cases, we don't need to reserve clusters, just let data_ac
+ * be NULL.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+					struct ocfs2_extent_tree *et,
+					u32 clusters_to_move,
+					u32 extents_to_split,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int extra_blocks,
+					int *credits)
+{
+	int ret, num_free_extents;
+	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	num_free_extents = ocfs2_num_free_extents(osb, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (data_ac) {
+		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
+					      clusters_to_move + 2);
+
+	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+	     extra_blocks, clusters_to_move, *credits);
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Using one journal handle to guarantee the data consistency in case
+ * crash happens anywhere.
+ *
+ *  XXX: defrag can end up with finishing partial extent as requested,
+ * due to not enough contiguous clusters can be found in allocator.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	u32 new_phys_cpos, new_len;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		BUG_ON(!context->refcount_loc);
+
+		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							context->refcount_loc,
+							phys_blkno,
+							*len,
+							&credits,
+							&extra_blocks);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+						 &context->meta_ac,
+						 &context->data_ac,
+						 extra_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * should be using allocation reservation strategy there?
+	 *
+	 * if (context->data_ac)
+	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+	 */
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock_mutex;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock_mutex;
+	}
+
+	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+				     &new_phys_cpos, &new_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * allowing partial extent moving is kind of 'pros and cons', it makes
+	 * whole defragmentation less likely to fail, on the contrary, the bad
+	 * thing is it may make the fs even more fragmented after moving, let
+	 * userspace make a good decision here.
+	 */
+	if (new_len != *len) {
+		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+		if (!partial) {
+			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+			ret = -ENOSPC;
+			goto out_commit;
+		}
+	}
+
+	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+	     phys_cpos, new_phys_cpos);
+
+	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+				  new_phys_cpos, ext_flags);
+	if (ret)
+		mlog_errno(ret);
+
+	if (partial && (new_len != *len))
+		*len = new_len;
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+out:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+/*
+ * find the victim alloc group, where #blkno fits.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+					 u64 vict_blkno,
+					 int type, int slot,
+					 int *vict_bit,
+					 struct buffer_head **ret_bh)
+{
+	int ret, i, blocks_per_unit = 1;
+	u64 blkno;
+	char namebuf[40];
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_chain_rec *rec;
+	struct ocfs2_dinode *ac_dinode;
+	struct ocfs2_group_desc *bg;
+
+	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+					 strlen(namebuf), &blkno);
+	if (ret) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+	cl = &(ac_dinode->id2.i_chain);
+	rec = &(cl->cl_recs[0]);
+
+	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+		blocks_per_unit <<= (osb->s_clustersize_bits -
+						inode->i_sb->s_blocksize_bits);
+	/*
+	 * 'vict_blkno' was out of the valid range.
+	 */
+	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+	    (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
+				blocks_per_unit))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+		rec = &(cl->cl_recs[i]);
+		if (!rec)
+			continue;
+
+		bg = NULL;
+
+		do {
+			if (!bg)
+				blkno = le64_to_cpu(rec->c_blkno);
+			else
+				blkno = le64_to_cpu(bg->bg_next_group);
+
+			if (gd_bh) {
+				brelse(gd_bh);
+				gd_bh = NULL;
+			}
+
+			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
+						le16_to_cpu(bg->bg_bits))) {
+
+				*ret_bh = gd_bh;
+				*vict_bit = (vict_blkno - blkno) /
+							blocks_per_unit;
+				mlog(0, "find the victim group: #%llu, "
+				     "total_bits: %u, vict_bit: %u\n",
+				     blkno, le16_to_cpu(bg->bg_bits),
+				     *vict_bit);
+				goto out;
+			}
+
+		} while (le64_to_cpu(bg->bg_next_group));
+	}
+
+	ret = -EINVAL;
+out:
+	brelse(ac_bh);
+
+	/*
+	 * caller has to release the gd_bh properly.
+	 */
+	return ret;
+}
+
+/*
+ * XXX: helper to validate and adjust moving goal.
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+					       struct ocfs2_move_extents *range)
+{
+	int ret, goal_bit = 0;
+
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int c_to_b = 1 << (osb->s_clustersize_bits -
+					inode->i_sb->s_blocksize_bits);
+
+	/*
+	 * validate goal sits within global_bitmap, and return the victim
+	 * group desc
+	 */
+	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT,
+					    &goal_bit, &gd_bh);
+	if (ret)
+		goto out;
+
+	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+	/*
+	 * make goal become cluster aligned.
+	 */
+	if (range->me_goal % c_to_b)
+		range->me_goal = range->me_goal / c_to_b * c_to_b;
+
+	/*
+	 * moving goal is not allowd to start with a group desc blok(#0 blk)
+	 * let's compromise to the latter cluster.
+	 */
+	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+		range->me_goal += c_to_b;
+
+	/*
+	 * movement is not gonna cross two groups.
+	 */
+	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+								range->me_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/*
+	 * more exact validations/adjustments will be performed later during
+	 * moving operation for each extent range.
+	 */
+	mlog(0, "extents get ready to be moved to #%llu block\n",
+	     range->me_goal);
+
+out:
+	brelse(gd_bh);
+
+	return ret;
+}
+
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+				    int *goal_bit, u32 move_len, u32 max_hop,
+				    u32 *phys_cpos)
+{
+	int i, used, last_free_bits = 0, base_bit = *goal_bit;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+						 le64_to_cpu(gd->bg_blkno));
+
+	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+		if (used) {
+			/*
+			 * we even tried searching the free chunk by jumping
+			 * a 'max_hop' distance, but still failed.
+			 */
+			if ((i - base_bit) > max_hop) {
+				*phys_cpos = 0;
+				break;
+			}
+
+			if (last_free_bits)
+				last_free_bits = 0;
+
+			continue;
+		} else
+			last_free_bits++;
+
+		if (last_free_bits == move_len) {
+			*goal_bit = i;
+			*phys_cpos = base_cpos + i;
+			break;
+		}
+	}
+
+	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
+
+static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+				       handle_t *handle,
+				       struct buffer_head *di_bh,
+				       u32 num_bits,
+				       u16 chain)
+{
+	int ret;
+	u32 tmp_used;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	struct ocfs2_chain_list *cl =
+				(struct ocfs2_chain_list *) &di->id2.i_chain;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
+	ocfs2_journal_dirty(handle, di_bh);
+
+out:
+	return ret;
+}
+
+static inline int ocfs2_block_group_set_bits(handle_t *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits)
+{
+	int status;
+	void *bitmap = bg->bg_bitmap;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+	     num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access_gd(handle,
+					 INODE_CACHE(alloc_inode),
+					 group_bh,
+					 journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
+	while (num_bits--)
+		ocfs2_set_bit(bit_off++, bitmap);
+
+	ocfs2_journal_dirty(handle, group_bh);
+
+bail:
+	return status;
+}
+
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+			     u32 len, int ext_flags)
+{
+	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *gb_inode = NULL;
+	struct buffer_head *gb_bh = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_group_desc *gd;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+						    context->range->me_threshold);
+	u64 phys_blkno, new_phys_blkno;
+
+	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		BUG_ON(!context->refcount_loc);
+
+		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							context->refcount_loc,
+							phys_blkno,
+							len,
+							&credits,
+							&extra_blocks);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+						 &context->meta_ac,
+						 NULL, extra_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * need to count 2 extra credits for global_bitmap inode and
+	 * group descriptor.
+	 */
+	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+	/*
+	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
+	 * logic, while we still need to lock the global_bitmap.
+	 */
+	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+					       OCFS2_INVALID_SLOT);
+	if (!gb_inode) {
+		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	mutex_lock(&gb_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock_gb_mutex;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock_tl_inode;
+	}
+
+	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT,
+					    &goal_bit, &gd_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * probe the victim cluster group to find a proper
+	 * region to fit wanted movement, it even will perfrom
+	 * a best-effort attempt by compromising to a threshold
+	 * around the goal.
+	 */
+	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+				new_phys_cpos);
+	if (!new_phys_cpos) {
+		ret = -ENOSPC;
+		goto out_commit;
+	}
+
+	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+				  *new_phys_cpos, ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+					 goal_bit, len);
+	if (ret)
+		mlog_errno(ret);
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	brelse(gd_bh);
+
+out_unlock_tl_inode:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+	mutex_unlock(&gb_inode->i_mutex);
+	brelse(gb_bh);
+	iput(gb_inode);
+
+out:
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+/*
+ * Helper to calculate the defraging length in one run according to threshold.
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+					 u32 threshold, int *skip)
+{
+	if ((*alloc_size + *len_defraged) < threshold) {
+		/*
+		 * proceed defragmentation until we meet the thresh
+		 */
+		*len_defraged += *alloc_size;
+	} else if (*len_defraged == 0) {
+		/*
+		 * XXX: skip a large extent.
+		 */
+		*skip = 1;
+	} else {
+		/*
+		 * split this extent to coalesce with former pieces as
+		 * to reach the threshold.
+		 *
+		 * we're done here with one cycle of defragmentation
+		 * in a size of 'thresh', resetting 'len_defraged'
+		 * forces a new defragmentation.
+		 */
+		*alloc_size = threshold - *len_defraged;
+		*len_defraged = 0;
+	}
+}
+
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+				struct ocfs2_move_extents_context *context)
+{
+	int ret = 0, flags, do_defrag, skip = 0;
+	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
+	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
+
+	struct inode *inode = context->inode;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_move_extents *range = context->range;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if ((inode->i_size == 0) || (range->me_len == 0))
+		return 0;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+	/*
+	 * TO-DO XXX:
+	 *
+	 * - xattr extents.
+	 */
+
+	do_defrag = context->auto_defrag;
+
+	/*
+	 * extents moving happens in unit of clusters, for the sake
+	 * of simplicity, we may ignore two clusters where 'byte_start'
+	 * and 'byte_start + len' were within.
+	 */
+	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+	len_to_move = (range->me_start + range->me_len) >>
+						osb->s_clustersize_bits;
+	if (len_to_move >= move_start)
+		len_to_move -= move_start;
+	else
+		len_to_move = 0;
+
+	if (do_defrag) {
+		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
+		if (defrag_thresh <= 1)
+			goto done;
+	} else
+		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+							 range->me_goal);
+
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+	     "thresh: %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)range->me_start,
+	     (unsigned long long)range->me_len,
+	     move_start, len_to_move, defrag_thresh);
+
+	cpos = move_start;
+	while (len_to_move) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
+					 &flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > len_to_move)
+			alloc_size = len_to_move;
+
+		/*
+		 * XXX: how to deal with a hole:
+		 *
+		 * - skip the hole of course
+		 * - force a new defragmentation
+		 */
+		if (!phys_cpos) {
+			if (do_defrag)
+				len_defraged = 0;
+
+			goto next;
+		}
+
+		if (do_defrag) {
+			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
+						     defrag_thresh, &skip);
+			/*
+			 * skip large extents
+			 */
+			if (skip) {
+				skip = 0;
+				goto next;
+			}
+
+			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+			     "alloc_size: %u, len_defraged: %u\n",
+			     cpos, phys_cpos, alloc_size, len_defraged);
+
+			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
+						  &alloc_size, flags);
+		} else {
+			ret = ocfs2_move_extent(context, cpos, phys_cpos,
+						&new_phys_cpos, alloc_size,
+						flags);
+
+			new_phys_cpos += alloc_size;
+		}
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		context->clusters_moved += alloc_size;
+next:
+		cpos += alloc_size;
+		len_to_move -= alloc_size;
+	}
+
+done:
+	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+						      context->clusters_moved);
+	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+						       context->new_phys_cpos);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &context->dealloc);
+
+	return ret;
+}
+
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+	int status;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_dinode *di;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!inode)
+		return -ENOENT;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * This prevents concurrent writes from other nodes
+	 */
+	status = ocfs2_rw_lock(inode, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out_rw_unlock;
+	}
+
+	/*
+	 * rememer ip_xattr_sem also needs to be held if necessary
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	status = __ocfs2_move_extents_range(di_bh, context);
+
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	if (status) {
+		mlog_errno(status);
+		goto out_inode_unlock;
+	}
+
+	/*
+	 * We update ctime for these changes
+	 */
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_inode_unlock;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+	brelse(di_bh);
+	ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+	ocfs2_rw_unlock(inode, 1);
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	return status;
+}
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+	int status;
+
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct ocfs2_move_extents range;
+	struct ocfs2_move_extents_context *context = NULL;
+
+	status = mnt_want_write(filp->f_path.mnt);
+	if (status)
+		return status;
+
+	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+		goto out;
+
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		status = -EPERM;
+		goto out;
+	}
+
+	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+	if (!context) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+
+	context->inode = inode;
+	context->file = filp;
+
+	if (argp) {
+		if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
+				   sizeof(range))) {
+			status = -EFAULT;
+			goto out;
+		}
+	} else {
+		status = -EINVAL;
+		goto out;
+	}
+
+	if (range.me_start > i_size_read(inode))
+		goto out;
+
+	if (range.me_start + range.me_len > i_size_read(inode))
+			range.me_len = i_size_read(inode) - range.me_start;
+
+	context->range = &range;
+
+	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+		context->auto_defrag = 1;
+		/*
+		 * ok, the default theshold for the defragmentation
+		 * is 1M, since our maximum clustersize was 1M also.
+		 * any thought?
+		 */
+		if (!range.me_threshold)
+			range.me_threshold = 1024 * 1024;
+
+		if (range.me_threshold > i_size_read(inode))
+			range.me_threshold = i_size_read(inode);
+
+		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+			context->partial = 1;
+	} else {
+		/*
+		 * first best-effort attempt to validate and adjust the goal
+		 * (physical address in block), while it can't guarantee later
+		 * operation can succeed all the time since global_bitmap may
+		 * change a bit over time.
+		 */
+
+		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+		if (status)
+			goto out;
+	}
+
+	status = ocfs2_move_extents(context);
+	if (status)
+		mlog_errno(status);
+out:
+	/*
+	 * movement/defragmentation may end up being partially completed,
+	 * that's the reason why we need to return userspace the finished
+	 * length and new_offset even if failure happens somewhere.
+	 */
+	if (argp) {
+		if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
+				sizeof(range)))
+			status = -EFAULT;
+	}
+
+	kfree(context);
+
+	mnt_drop_write(filp->f_path.mnt);
+
+	return status;
+}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 0000000..4e143e8
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp,  void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39b..5b27ff1 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
 	__u64 ij_journal_size;
 };
 
+struct ocfs2_info_freeinode {
+	struct ocfs2_info_request ifi_req;
+	struct ocfs2_info_local_freeinode {
+		__u64 lfi_total;
+		__u64 lfi_free;
+	} ifi_stat[OCFS2_MAX_SLOTS];
+	__u32 ifi_slotnum; /* out */
+	__u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST     (32)
+
+struct ocfs2_info_freefrag {
+	struct ocfs2_info_request iff_req;
+	struct ocfs2_info_freefrag_stats { /* (out) */
+		struct ocfs2_info_free_chunk_list {
+			__u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+			__u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+		} ffs_fc_hist;
+		__u32 ffs_clusters;
+		__u32 ffs_free_clusters;
+		__u32 ffs_free_chunks;
+		__u32 ffs_free_chunks_real;
+		__u32 ffs_min; /* Minimum free chunksize in clusters */
+		__u32 ffs_max;
+		__u32 ffs_avg;
+		__u32 ffs_pad;
+	} iff_ffs;
+	__u32 iff_chunksize; /* chunksize in clusters(in) */
+	__u32 iff_pad;
+};
+
 /* Codes for ocfs2_info_request */
 enum ocfs2_info_type {
 	OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
 	OCFS2_INFO_UUID,
 	OCFS2_INFO_FS_FEATURES,
 	OCFS2_INFO_JOURNAL_SIZE,
+	OCFS2_INFO_FREEINODE,
+	OCFS2_INFO_FREEFRAG,
 	OCFS2_INFO_NUM_TYPES
 };
 
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
 
 #define OCFS2_IOC_INFO		_IOR('o', 5, struct ocfs2_info)
 
+struct ocfs2_move_extents {
+/* All values are in bytes */
+	/* in */
+	__u64 me_start;		/* Virtual start in the file to move */
+	__u64 me_len;		/* Length of the extents to be moved */
+	__u64 me_goal;		/* Physical offset of the goal,
+				   it's in block unit */
+	__u64 me_threshold;	/* Maximum distance from goal or threshold
+				   for auto defragmentation */
+	__u64 me_flags;		/* Flags for the operation:
+				 * - auto defragmentation.
+				 * - refcount,xattr cases.
+				 */
+	/* out */
+	__u64 me_moved_len;	/* Moved/defraged length */
+	__u64 me_new_offset;	/* Resulting physical location */
+	__u32 me_reserved[2];	/* Reserved for futhure */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG	(0x00000001)	/* Kernel manages to
+							   claim new clusters
+							   as the goal place
+							   for extents moving */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG	(0x00000002)	/* Allow partial extent
+							   moving, is to make
+							   movement less likely
+							   to fail, may make fs
+							   even more fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE	(0x00000004)	/* Move or defragmenation
+							   completely gets done.
+							 */
+
+#define OCFS2_IOC_MOVE_EXT	_IOW('o', 6, struct ocfs2_move_extents)
+
 #endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5b..3b481f4 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
 		  __entry->blkno, __entry->bit)
 );
 
+TRACE_EVENT(ocfs2_trim_extent,
+	TP_PROTO(struct super_block *sb, unsigned long long blk,
+		 unsigned long long count),
+	TP_ARGS(sb, blk, count),
+	TP_STRUCT__entry(
+		__field(int, dev_major)
+		__field(int, dev_minor)
+		__field(unsigned long long, blk)
+		__field(__u64,	count)
+	),
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(sb->s_dev);
+		__entry->dev_minor = MINOR(sb->s_dev);
+		__entry->blk = blk;
+		__entry->count = count;
+	),
+	TP_printk("%d %d %llu %llu",
+		  __entry->dev_major, __entry->dev_minor,
+		  __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
 /* End of trace events for fs/ocfs2/alloc.c. */
 
 /* Trace events for fs/ocfs2/localalloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606c..ebfd382 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
 			    u32 *num_clusters,
 			    unsigned int *extent_flags);
 	int (*cow_duplicate_clusters)(handle_t *handle,
-				      struct ocfs2_cow_context *context,
+				      struct file *file,
 				      u32 cpos, u32 old_cluster,
 				      u32 new_cluster, u32 new_len);
 };
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
-					    struct ocfs2_cow_context *context,
-					    u32 cpos, u32 old_cluster,
-					    u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+				     struct file *file,
+				     u32 cpos, u32 old_cluster,
+				     u32 new_cluster, u32 new_len)
 {
 	int ret = 0, partial;
-	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct ocfs2_caching_info *ci = INODE_CACHE(inode);
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
 	struct page *page;
 	pgoff_t page_index;
 	unsigned int from, to, readahead_pages;
 	loff_t offset, end, map_end;
-	struct address_space *mapping = context->inode->i_mapping;
+	struct address_space *mapping = inode->i_mapping;
 
 	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
 					       new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 	 * We only duplicate pages until we reach the page contains i_size - 1.
 	 * So trim 'end' to i_size.
 	 */
-	if (end > i_size_read(context->inode))
-		end = i_size_read(context->inode);
+	if (end > i_size_read(inode))
+		end = i_size_read(inode);
 
 	while (offset < end) {
 		page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
 			BUG_ON(PageDirty(page));
 
-		if (PageReadahead(page) && context->file) {
+		if (PageReadahead(page)) {
 			page_cache_async_readahead(mapping,
-						   &context->file->f_ra,
-						   context->file,
+						   &file->f_ra, file,
 						   page, page_index,
 						   readahead_pages);
 		}
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 			}
 		}
 
-		ocfs2_map_and_dirty_page(context->inode,
-					 handle, from, to,
+		ocfs2_map_and_dirty_page(inode, handle, from, to,
 					 page, 0, &new_block);
 		mark_page_accessed(page);
 unlock:
@@ -3015,14 +3014,15 @@ unlock:
 	return ret;
 }
 
-static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
-					   struct ocfs2_cow_context *context,
-					   u32 cpos, u32 old_cluster,
-					   u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+				    struct file *file,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len)
 {
 	int ret = 0;
-	struct super_block *sb = context->inode->i_sb;
-	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_caching_info *ci = INODE_CACHE(inode);
 	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
 	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
 
 	/*If the old clusters is unwritten, no need to duplicate. */
 	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-		ret = context->cow_duplicate_clusters(handle, context, cpos,
-						      old, new, len);
+		ret = context->cow_duplicate_clusters(handle, context->file,
+						      cpos, old, new, len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3162,22 +3162,22 @@ out:
 	return ret;
 }
 
-static int ocfs2_cow_sync_writeback(struct super_block *sb,
-				    struct ocfs2_cow_context *context,
-				    u32 cpos, u32 num_clusters)
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+			     struct inode *inode,
+			     u32 cpos, u32 num_clusters)
 {
 	int ret = 0;
 	loff_t offset, end, map_end;
 	pgoff_t page_index;
 	struct page *page;
 
-	if (ocfs2_should_order_data(context->inode))
+	if (ocfs2_should_order_data(inode))
 		return 0;
 
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
 
-	ret = filemap_fdatawrite_range(context->inode->i_mapping,
+	ret = filemap_fdatawrite_range(inode->i_mapping,
 				       offset, end - 1);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
 		if (map_end > end)
 			map_end = end;
 
-		page = find_or_create_page(context->inode->i_mapping,
+		page = find_or_create_page(inode->i_mapping,
 					   page_index, GFP_NOFS);
 		BUG_ON(!page);
 
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	 * in write-back mode.
 	 */
 	if (context->get_clusters == ocfs2_di_get_clusters) {
-		ret = ocfs2_cow_sync_writeback(sb, context, cpos,
+		ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
 					       orig_num_clusters);
 		if (ret)
 			mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7..7754608 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
 			     struct buffer_head *ref_root_bh,
 			     u32 cpos, u32 write_len,
 			     struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+				     struct file *file,
+				     u32 cpos, u32 old_cluster,
+				     u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+				    struct file *file,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+			     struct inode *inode,
+			     u32 cpos, u32 num_clusters);
 int ocfs2_add_refcount_flag(struct inode *inode,
 			    struct ocfs2_extent_tree *data_et,
 			    struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5a521c7..cdbaf5e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
+#include <linux/cleancache.h>
 
 #define CREATE_TRACE_POINTS
 #include "ocfs2_trace.h"
@@ -1566,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->preferred_slot != OCFS2_INVALID_SLOT)
 		seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
 
-	if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
+	if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
 		seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
 
 	if (osb->osb_commit_interval)
@@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		mlog_errno(status);
 		goto bail;
 	}
+	cleancache_init_shared_fs((char *)&uuid_net_key, sb);
 
 bail:
 	return status;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29..c368360c 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int ret;
 
-	if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
-		return -ENOTEMPTY;
+
+	if (S_ISDIR(inode->i_mode)) {
+		dentry_unhash(dentry);
+		if (!omfs_dir_is_empty(inode))
+			return -ENOTEMPTY;
+	}
 
 	ret = omfs_delete_entry(dentry);
 	if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int err;
 
 	if (new_inode) {
+		if (S_ISDIR(new_inode->i_mode))
+			dentry_unhash(new_dentry);
+
 		/* overwriting existing file/dir */
 		err = omfs_remove(new_dir, new_dentry);
 		if (err)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2c9db29..db15935 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
-	int flags = vma->vm_flags;
+	vm_flags_t flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
 	unsigned long start, end;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 1186626..76c8164 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 	INITIALIZE_PATH(path);
 	struct reiserfs_dir_entry de;
 
+	dentry_unhash(dentry);
+
 	/* we will be doing 2 balancings and update 2 stat data, we change quotas
 	 * of the owner of the directory and of the owner of the parent directory.
 	 * The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	unsigned long savelink = 1;
 	struct timespec ctime;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	/* three balancings: (1) old name removal, (2) new name insertion
 	   and (3) maybe "save" link insertion
 	   stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a44..50f1abc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 	mutex_unlock(&dentry->d_inode->i_mutex);
 	if (!error)
 		d_delete(dentry);
-	dput(dentry);
 
 	return error;
 }
diff --git a/fs/super.c b/fs/super.c
index c04f7e0..c755939 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,6 +31,7 @@
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		s->cleancache_poolid = -1;
 	}
 out:
 	return s;
@@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
+		cleancache_flush_fs(s);
 		fs->kill_sb(s);
 		/*
 		 * We need to call rcu_barrier so all the delayed rcu free
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbc..e2cc675 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
 	struct inode *inode = dentry->d_inode;
 	int err = -ENOTEMPTY;
 
+	dentry_unhash(dentry);
+
 	if (sysv_empty_dir(inode)) {
 		err = sysv_unlink(dir, dentry);
 		if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 	struct sysv_dir_entry * old_de;
 	int err = -ENOENT;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_de = sysv_find_entry(old_dentry, &old_page);
 	if (!old_de)
 		goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef5abd3..c2b8094 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
 	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
 
+	dentry_unhash(dentry);
+
 	/*
 	 * Budget request settings: deletion direntry, deletion inode and
 	 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
 	struct timespec time;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	/*
 	 * Budget request settings: deletion direntry, new direntry, removing
 	 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce84..4d76594 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fileIdentDesc *fi, cfi;
 	struct kernel_lb_addr tloc;
 
+	dentry_unhash(dentry);
+
 	retval = -ENOENT;
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct kernel_lb_addr tloc;
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	if (ofi) {
 		if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e2..953ebdf 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	struct inode * inode = dentry->d_inode;
 	int err= -ENOTEMPTY;
 
+	dentry_unhash(dentry);
+
 	lock_ufs(dir->i_sb);
 	if (ufs_empty_dir (inode)) {
 		err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct ufs_dir_entry *old_de;
 	int err = -ENOENT;
 
+	if (new_inode && S_ISDIR(new_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c..244e797 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
 		return -XFS_ERROR(EFAULT);
 	return 0;
 }
+
+int
+xfs_discard_extents(
+	struct xfs_mount	*mp,
+	struct list_head	*list)
+{
+	struct xfs_busy_extent	*busyp;
+	int			error = 0;
+
+	list_for_each_entry(busyp, list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+					 busyp->length);
+
+		error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, 0);
+		if (error && error != EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llu,%u], error %d",
+				 (unsigned long long)busyp->bno,
+				 busyp->length,
+				 error);
+			return error;
+		}
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd..344879a 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
 #define XFS_DISCARD_H 1
 
 struct fstrim_range;
+struct list_head;
 
 extern int	xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int	xfs_discard_extents(struct xfs_mount *, struct list_head *);
 
 #endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b0aa59e..98b9c91 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
 #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
 #define MNTOPT_QUOTANOENF  "qnoenforce"	/* same as uqnoenforce */
-#define MNTOPT_DELAYLOG   "delaylog"	/* Delayed loging enabled */
-#define MNTOPT_NODELAYLOG "nodelaylog"	/* Delayed loging disabled */
+#define MNTOPT_DELAYLOG    "delaylog"	/* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG  "nodelaylog"	/* Delayed logging disabled */
+#define MNTOPT_DISCARD	   "discard"	/* Discard unused blocks */
+#define MNTOPT_NODISCARD   "nodiscard"	/* Do not discard unused blocks */
 
 /*
  * Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
 			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+		} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+			mp->m_flags |= XFS_MOUNT_DISCARD;
+		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+			mp->m_flags &= ~XFS_MOUNT_DISCARD;
 		} else if (!strcmp(this_char, "ihashsize")) {
 			xfs_warn(mp,
 	"ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
 		return EINVAL;
 	}
 
+	if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
+	    !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
+		xfs_warn(mp,
+	"the discard option is incompatible with the nodelaylog option");
+		return EINVAL;
+	}
+
 #ifndef CONFIG_XFS_QUOTA
 	if (XFS_IS_QUOTA_RUNNING(mp)) {
 		xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
 		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
 		{ XFS_MOUNT_DELAYLOG,		"," MNTOPT_DELAYLOG },
+		{ XFS_MOUNT_DISCARD,		"," MNTOPT_DISCARD },
 		{ 0, NULL }
 	};
 	static struct proc_xfs_info xfs_info_unset[] = {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index da0a561..6530769 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,6 +187,9 @@ struct xfs_busy_extent {
 	xfs_agnumber_t	agno;
 	xfs_agblock_t	bno;
 	xfs_extlen_t	length;
+	unsigned int	flags;
+#define XFS_ALLOC_BUSY_DISCARDED	0x01	/* undergoing a discard op. */
+#define XFS_ALLOC_BUSY_SKIP_DISCARD	0x02	/* do not discard */
 };
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index acdced8..95862bb 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2469,7 +2469,7 @@ xfs_free_extent(
 
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 	if (!error)
-		xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
+		xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
 error0:
 	xfs_perag_put(args.pag);
 	return error;
@@ -2480,7 +2480,8 @@ xfs_alloc_busy_insert(
 	struct xfs_trans	*tp,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		bno,
-	xfs_extlen_t		len)
+	xfs_extlen_t		len,
+	unsigned int		flags)
 {
 	struct xfs_busy_extent	*new;
 	struct xfs_busy_extent	*busyp;
@@ -2504,6 +2505,7 @@ xfs_alloc_busy_insert(
 	new->bno = bno;
 	new->length = len;
 	INIT_LIST_HEAD(&new->list);
+	new->flags = flags;
 
 	/* trace before insert to be able to see failed inserts */
 	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
@@ -2609,6 +2611,18 @@ xfs_alloc_busy_update_extent(
 	xfs_agblock_t		bend = bbno + busyp->length;
 
 	/*
+	 * This extent is currently being discarded.  Give the thread
+	 * performing the discard a chance to mark the extent unbusy
+	 * and retry.
+	 */
+	if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
+		spin_unlock(&pag->pagb_lock);
+		delay(1);
+		spin_lock(&pag->pagb_lock);
+		return false;
+	}
+
+	/*
 	 * If there is a busy extent overlapping a user allocation, we have
 	 * no choice but to force the log and retry the search.
 	 *
@@ -2813,7 +2827,8 @@ restart:
 		 * If this is a metadata allocation, try to reuse the busy
 		 * extent instead of trimming the allocation.
 		 */
-		if (!args->userdata) {
+		if (!args->userdata &&
+		    !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
 			if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
 							  busyp, fbno, flen,
 							  false))
@@ -2979,10 +2994,16 @@ xfs_alloc_busy_clear_one(
 	kmem_free(busyp);
 }
 
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
 void
 xfs_alloc_busy_clear(
 	struct xfs_mount	*mp,
-	struct list_head	*list)
+	struct list_head	*list,
+	bool			do_discard)
 {
 	struct xfs_busy_extent	*busyp, *n;
 	struct xfs_perag	*pag = NULL;
@@ -2999,7 +3020,11 @@ xfs_alloc_busy_clear(
 			agno = busyp->agno;
 		}
 
-		xfs_alloc_busy_clear_one(mp, pag, busyp);
+		if (do_discard && busyp->length &&
+		    !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
+			busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
+		else
+			xfs_alloc_busy_clear_one(mp, pag, busyp);
 	}
 
 	if (pag) {
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 240ad28..2f52b92 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,10 +137,11 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 void
 xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
-	xfs_agblock_t bno, xfs_extlen_t len);
+	xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
 
 void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
+	bool do_discard);
 
 int
 xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 8b469d5..2b35188 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -120,7 +120,8 @@ xfs_allocbt_free_block(
 	if (error)
 		return error;
 
-	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+			      XFS_ALLOC_BUSY_SKIP_DISCARD);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788..e546a33 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
 	int			*flags);	/* inode logging flags */
 
 /*
- * Called by xfs_bmapi to update file extent records and the btree
- * after allocating space (or doing a delayed allocation).
- */
-STATIC int				/* error */
-xfs_bmap_add_extent(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd);	/* OK to allocate reserved blocks */
-
-/*
  * Called by xfs_bmap_add_extent to handle cases converting a delayed
  * allocation to a real allocation.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp, /* inode logging flags */
-	int			rsvd);	/* OK to allocate reserved blocks */
+	int			*logflagsp); /* inode logging flags */
 
 /*
  * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp,/* inode logging flags */
-	int			rsvd);	/* OK to allocate reserved blocks */
+	int			*logflagsp); /* inode logging flags */
 
 /*
  * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
 STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
 	int			whichfork); /* data or attr fork */
 
 /*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int				/* error */
-xfs_bmap_del_extent(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_trans_t		*tp,	/* current trans pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp,/* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd);	 /* OK to allocate reserved blocks */
-
-/*
  * Remove the entry "free" from the free item list.  Prev points to the
  * previous entry, unless "free" is the head of the list.
  */
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
 STATIC int				/* error */
 xfs_bmap_add_extent(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd)	/* OK to use reserved data blocks */
+	int			whichfork) /* data or attr fork */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor or null */
 	xfs_filblks_t		da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
 	xfs_extnum_t		nextents; /* number of extents in file now */
 
 	XFS_STATS_INC(xs_add_exlist);
+
 	cur = *curp;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	ASSERT(idx <= nextents);
 	da_old = da_new = 0;
 	error = 0;
+
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= nextents);
+
 	/*
 	 * This is the first extent added to a new/empty file.
 	 * Special case this one, so other routines get to assume there are
 	 * already extents in the list.
 	 */
 	if (nextents == 0) {
-		xfs_iext_insert(ip, 0, 1, new,
+		xfs_iext_insert(ip, *idx, 1, new,
 				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
 
 		ASSERT(cur == NULL);
-		ifp->if_lastex = 0;
+
 		if (!isnullstartblock(new->br_startblock)) {
 			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
 			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
-		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-				&logflags, rsvd)))
-			goto done;
+		error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
+						       &logflags);
 	}
 	/*
 	 * Real allocation off the end of the file.
 	 */
-	else if (idx == nextents) {
+	else if (*idx == nextents) {
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
-		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, whichfork)))
-			goto done;
+		error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+				&logflags, whichfork);
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
 
 		/*
 		 * Get the record referred to by idx.
 		 */
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
 		/*
 		 * If it's a real allocation record, and the new allocation ends
 		 * after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
 				if (cur)
 					ASSERT(cur->bc_private.b.flags &
 						XFS_BTCUR_BPRV_WASDEL);
-				if ((error = xfs_bmap_add_extent_delay_real(ip,
-					idx, &cur, new, &da_new, first, flist,
-					&logflags, rsvd)))
-					goto done;
-			} else if (new->br_state == XFS_EXT_NORM) {
-				ASSERT(new->br_state == XFS_EXT_NORM);
-				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags)))
-					goto done;
+				error = xfs_bmap_add_extent_delay_real(ip,
+						idx, &cur, new, &da_new,
+						first, flist, &logflags);
 			} else {
-				ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
-				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags)))
+				ASSERT(new->br_state == XFS_EXT_NORM ||
+				       new->br_state == XFS_EXT_UNWRITTEN);
+
+				error = xfs_bmap_add_extent_unwritten_real(ip,
+						idx, &cur, new, &logflags);
+				if (error)
 					goto done;
 			}
-			ASSERT(*curp == cur || *curp == NULL);
 		}
 		/*
 		 * Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
 			if (cur)
 				ASSERT((cur->bc_private.b.flags &
 					XFS_BTCUR_BPRV_WASDEL) == 0);
-			if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-					new, &logflags, whichfork)))
-				goto done;
+			error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+					new, &logflags, whichfork);
 		}
 	}
 
+	if (error)
+		goto done;
 	ASSERT(*curp == cur || *curp == NULL);
+
 	/*
 	 * Convert to a btree if necessary.
 	 */
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
 		ASSERT(nblks <= da_old);
 		if (nblks < da_old)
 			xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-				(int64_t)(da_old - nblks), rsvd);
+				(int64_t)(da_old - nblks), 0);
 	}
 	/*
 	 * Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp, /* inode logging flags */
-	int			rsvd)	/* OK to use reserved data block allocation */
+	int			*logflagsp) /* inode logging flags */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	int			diff;	/* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
 	 */
 	cur = *curp;
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, idx);
+	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
 
 		if (isnullstartblock(LEFT.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 
 		if (isnullstartblock(RIGHT.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, idx, 2, state);
-		ip->i_df.if_lastex = idx - 1;
+		xfs_iext_remove(ip, *idx + 1, 2, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx - 1;
-		xfs_iext_remove(ip, idx, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx;
-		xfs_iext_remove(ip, idx + 1, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_blockcount, PREV.br_state)))
 				goto done;
 		}
+
 		*dnew = 0;
 		break;
 
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
+
 		*dnew = 0;
 		break;
 
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
 
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		--*idx;
 		*dnew = temp;
 		break;
 
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ip->i_df.if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, idx + 1);
+		ep = xfs_iext_get_ext(ifp, *idx + 1);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
+
 		*dnew = temp;
 		break;
 
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
-		ip->i_df.if_lastex = idx + 1;
+		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_state)))
 				goto done;
 		}
+
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
 		*dnew = temp;
 		break;
 
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, idx + 1, 1, new, state);
-		ip->i_df.if_lastex = idx + 1;
+		xfs_iext_insert(ip, *idx + 1, 1, new, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, idx);
+		ep = xfs_iext_get_ext(ifp, *idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
 		*dnew = temp;
 		break;
 
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
 		LEFT = *new;
 		RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
 		RIGHT.br_startoff = new_endoff;
 		RIGHT.br_blockcount = temp2;
 		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
-		xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
-		ip->i_df.if_lastex = idx + 1;
+		xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		if (diff > 0 &&
 		    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-					     -((int64_t)diff), rsvd)) {
+					     -((int64_t)diff), 0)) {
 			/*
 			 * Ick gross gag me with a spoon.
 			 */
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
 					if (!diff ||
 					    !xfs_icsb_modify_counters(ip->i_mount,
 						    XFS_SBS_FDBLOCKS,
-						    -((int64_t)diff), rsvd))
+						    -((int64_t)diff), 0))
 						break;
 				}
 				if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
 					if (!diff ||
 					    !xfs_icsb_modify_counters(ip->i_mount,
 						    XFS_SBS_FDBLOCKS,
-						    -((int64_t)diff), rsvd))
+						    -((int64_t)diff), 0))
 						break;
 				}
 			}
 		}
-		ep = xfs_iext_get_ext(ifp, idx);
+		ep = xfs_iext_get_ext(ifp, *idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
 			nullstartblock((int)temp2));
-		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+
+		++*idx;
 		*dnew = temp + temp2;
 		break;
 
@@ -1161,7 +1131,7 @@ done:
 STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
 	error = 0;
 	cur = *curp;
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, idx);
+	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	newext = new->br_state;
 	oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
 
 		if (isnullstartblock(LEFT.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 		if (isnullstartblock(RIGHT.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting all of a previous oldext extent to newext.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, idx, 2, state);
-		ip->i_df.if_lastex = idx - 1;
+		xfs_iext_remove(ip, *idx + 1, 2, state);
 		ip->i_d.di_nextents -= 2;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx - 1;
-		xfs_iext_remove(ip, idx, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting all of a previous oldext extent to newext.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
 		xfs_bmbt_set_state(ep, newext);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		ip->i_df.if_lastex = idx;
-		xfs_iext_remove(ip, idx + 1, 1, state);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_state(ep, newext);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
 
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		--*idx;
 
-		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ip->i_df.if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount, newext);
-		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 
-		xfs_iext_insert(ip, idx + 1, 1, new, state);
-		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
 		 * newext.  Contiguity is impossible here.
 		 * One extent becomes three extents.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			new->br_startoff - PREV.br_startoff);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		r[0] = *new;
 		r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_startblock = new->br_startblock + new->br_blockcount;
 		r[1].br_state = oldext;
-		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
-		ip->i_df.if_lastex = idx + 1;
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
 		ip->i_d.di_nextents += 2;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	int			rsvd)		/* OK to allocate reserved blocks */
+	int			*logflagsp) /* inode logging flags */
 {
-	xfs_bmbt_rec_host_t	*ep;	/* extent record for idx */
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
 	xfs_filblks_t		newlen=0;	/* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
 	ASSERT(isnullstartblock(new->br_startblock));
 
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
 
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(ep, &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
 		 * on the left and on the right.
 		 * Merge all three into a single extent record.
 		 */
+		--*idx;
 		temp = left.br_blockcount + new->br_blockcount +
 			right.br_blockcount;
 
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, idx, 1, state);
-		ip->i_df.if_lastex = idx - 1;
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
+		--*idx;
 		temp = left.br_blockcount + new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
-
-		ip->i_df.if_lastex = idx - 1;
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		temp = new->br_blockcount + right.br_blockcount;
 		oldlen = startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_allf(ep, new->br_startoff,
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff,
 			nullstartblock((int)newlen), temp, right.br_state);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-
-		ip->i_df.if_lastex = idx;
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		break;
 
 	case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
 		 * Insert a new entry.
 		 */
 		oldlen = newlen = 0;
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ip->i_df.if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		break;
 	}
 	if (oldlen != newlen) {
 		ASSERT(oldlen > newlen);
 		xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-			(int64_t)(oldlen - newlen), rsvd);
+			(int64_t)(oldlen - newlen), 0);
 		/*
 		 * Nothing to do for disk quota accounting here.
 		 */
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_real(
 	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
 	int			whichfork) /* data or attr fork */
 {
-	xfs_bmbt_rec_host_t	*ep;	/* pointer to extent entry ins. point */
 	int			error;	/* error return value */
 	int			i;	/* temp state */
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
 	int			state;	/* state bits, accessed thru macros */
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
-	ep = xfs_iext_get_ext(ifp, idx);
+	ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
 	state = 0;
 
 	if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 */
-	if (idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
 	}
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(ep, &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
 		 * left and on the right.
 		 * Merge all three into a single extent record.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			left.br_blockcount + new->br_blockcount +
 			right.br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 
-		xfs_iext_remove(ip, idx, 1, state);
-		ifp->if_lastex = idx - 1;
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			left.br_blockcount + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff, new->br_startblock,
 			new->br_blockcount + right.br_blockcount,
 			right.br_state);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		ifp->if_lastex = idx;
 		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
 		 * real allocation.
 		 * Insert a new entry.
 		 */
-		xfs_iext_insert(ip, idx, 1, new, state);
-		ifp->if_lastex = idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int				/* error */
 xfs_bmap_del_extent(
 	xfs_inode_t		*ip,	/* incore inode pointer */
 	xfs_trans_t		*tp,	/* current transaction pointer */
-	xfs_extnum_t		idx,	/* extent number to update/delete */
+	xfs_extnum_t		*idx,	/* extent number to update/delete */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
-	int			whichfork, /* data or attr fork */
-	int			rsvd)	/* OK to allocate reserved blocks */
+	int			whichfork) /* data or attr fork */
 {
 	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
 	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
 
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT((idx >= 0) && (idx < ifp->if_bytes /
+	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
 		(uint)sizeof(xfs_bmbt_rec_t)));
 	ASSERT(del->br_blockcount > 0);
-	ep = xfs_iext_get_ext(ifp, idx);
+	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &got);
 	ASSERT(got.br_startoff <= del->br_startoff);
 	del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
 		/*
 		 * Matches the whole extent.  Delete the entry.
 		 */
-		xfs_iext_remove(ip, idx, 1,
+		xfs_iext_remove(ip, *idx, 1,
 				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-		ifp->if_lastex = idx;
+		--*idx;
 		if (delay)
 			break;
+
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
 		/*
 		 * Deleting the first part of the extent.
 		 */
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, del_endoff);
 		temp = got.br_blockcount - del->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		ifp->if_lastex = idx;
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
 			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 			da_new = temp;
 			break;
 		}
 		xfs_bmbt_set_startblock(ep, del_endblock);
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		if (!cur) {
 			flags |= xfs_ilog_fext(whichfork);
 			break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
 		 * Deleting the last part of the extent.
 		 */
 		temp = got.br_blockcount - del->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		ifp->if_lastex = idx;
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
 			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 			da_new = temp;
 			break;
 		}
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		if (!cur) {
 			flags |= xfs_ilog_fext(whichfork);
 			break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
 		 * Deleting the middle of the extent.
 		 */
 		temp = del->br_startoff - got.br_startoff;
-		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		new.br_startoff = del_endoff;
 		temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
 				}
 			}
 		}
-		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-		xfs_iext_insert(ip, idx + 1, 1, &new, state);
-		ifp->if_lastex = idx + 1;
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+		++*idx;
 		break;
 	}
 	/*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
 	ASSERT(da_old >= da_new);
 	if (da_old > da_new) {
 		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-			(int64_t)(da_old - da_new), rsvd);
+			(int64_t)(da_old - da_new), 0);
 	}
 done:
 	*logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
 				if (rt) {
 					error = xfs_mod_incore_sb(mp,
 							XFS_SBS_FREXTENTS,
-							-((int64_t)extsz), (flags &
-							XFS_BMAPI_RSVBLOCKS));
+							-((int64_t)extsz), 0);
 				} else {
 					error = xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
-							-((int64_t)alen), (flags &
-							XFS_BMAPI_RSVBLOCKS));
+							-((int64_t)alen), 0);
 				}
 				if (!error) {
 					error = xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
-							-((int64_t)indlen), (flags &
-							XFS_BMAPI_RSVBLOCKS));
+							-((int64_t)indlen), 0);
 					if (error && rt)
 						xfs_mod_incore_sb(mp,
 							XFS_SBS_FREXTENTS,
-							(int64_t)extsz, (flags &
-							XFS_BMAPI_RSVBLOCKS));
+							(int64_t)extsz, 0);
 					else if (error)
 						xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
-							(int64_t)alen, (flags &
-							XFS_BMAPI_RSVBLOCKS));
+							(int64_t)alen, 0);
 				}
 
 				if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
 				if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
 					got.br_state = XFS_EXT_UNWRITTEN;
 			}
-			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
+			error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
 				firstblock, flist, &tmp_logflags,
-				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
+				whichfork);
 			logflags |= tmp_logflags;
 			if (error)
 				goto error0;
-			lastx = ifp->if_lastex;
 			ep = xfs_iext_get_ext(ifp, lastx);
 			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 			xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
 			mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
 						? XFS_EXT_NORM
 						: XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
+			error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
 				firstblock, flist, &tmp_logflags,
-				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
+				whichfork);
 			logflags |= tmp_logflags;
 			if (error)
 				goto error0;
-			lastx = ifp->if_lastex;
 			ep = xfs_iext_get_ext(ifp, lastx);
 			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 			xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
 		/*
 		 * Else go on to the next record.
 		 */
-		ep = xfs_iext_get_ext(ifp, ++lastx);
 		prev = got;
-		if (lastx >= nextents)
-			eof = 1;
-		else
+		if (++lastx < nextents) {
+			ep = xfs_iext_get_ext(ifp, lastx);
 			xfs_bmbt_get_all(ep, &got);
+		} else {
+			eof = 1;
+		}
 	}
-	ifp->if_lastex = lastx;
 	*nmap = n;
 	/*
 	 * Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
 	ASSERT(!isnullstartblock(got.br_startblock));
 	ASSERT(bno < got.br_startoff + got.br_blockcount);
 	*fsb = got.br_startblock + (bno - got.br_startoff);
-	ifp->if_lastex = lastx;
 	return 0;
 }
 
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
 	int			tmp_logflags;	/* partial logging flags */
 	int			wasdel;		/* was a delayed alloc extent */
 	int			whichfork;	/* data or attribute fork */
-	int			rsvd;		/* OK to allocate reserved blocks */
 	xfs_fsblock_t		sum;
 
 	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
 	mp = ip->i_mount;
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
-	rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+
 	ASSERT(len > 0);
 	ASSERT(nexts >= 0);
 	ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
 				del.br_blockcount = mod;
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
+			error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
 				firstblock, flist, &logflags,
-				XFS_DATA_FORK, 0);
+				XFS_DATA_FORK);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
 				 */
 				ASSERT(bno >= del.br_blockcount);
 				bno -= del.br_blockcount;
-				if (bno < got.br_startoff) {
-					if (--lastx >= 0)
-						xfs_bmbt_get_all(--ep, &got);
+				if (got.br_startoff > bno) {
+					if (--lastx >= 0) {
+						ep = xfs_iext_get_ext(ifp,
+								      lastx);
+						xfs_bmbt_get_all(ep, &got);
+					}
 				}
 				continue;
 			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
 					prev.br_startoff = start;
 				}
 				prev.br_state = XFS_EXT_UNWRITTEN;
-				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
+				lastx--;
+				error = xfs_bmap_add_extent(ip, &lastx, &cur,
 					&prev, firstblock, flist, &logflags,
-					XFS_DATA_FORK, 0);
+					XFS_DATA_FORK);
 				if (error)
 					goto error0;
 				goto nodelete;
 			} else {
 				ASSERT(del.br_state == XFS_EXT_NORM);
 				del.br_state = XFS_EXT_UNWRITTEN;
-				error = xfs_bmap_add_extent(ip, lastx, &cur,
+				error = xfs_bmap_add_extent(ip, &lastx, &cur,
 					&del, firstblock, flist, &logflags,
-					XFS_DATA_FORK, 0);
+					XFS_DATA_FORK);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
 				rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
 				do_div(rtexts, mp->m_sb.sb_rextsize);
 				xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-						(int64_t)rtexts, rsvd);
+						(int64_t)rtexts, 0);
 				(void)xfs_trans_reserve_quota_nblks(NULL,
 					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_RTBLKS);
 			} else {
 				xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-						(int64_t)del.br_blockcount, rsvd);
+						(int64_t)del.br_blockcount, 0);
 				(void)xfs_trans_reserve_quota_nblks(NULL,
 					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
 			error = XFS_ERROR(ENOSPC);
 			goto error0;
 		}
-		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-				&tmp_logflags, whichfork, rsvd);
+		error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+				&tmp_logflags, whichfork);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
 		bno = del.br_startoff - 1;
 nodelete:
-		lastx = ifp->if_lastex;
 		/*
 		 * If not done go on to the next (previous) record.
-		 * Reset ep in case the extents array was re-alloced.
 		 */
-		ep = xfs_iext_get_ext(ifp, lastx);
 		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
-			if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
-			    xfs_bmbt_get_startoff(ep) > bno) {
-				if (--lastx >= 0)
-					ep = xfs_iext_get_ext(ifp, lastx);
-			}
-			if (lastx >= 0)
+			if (lastx >= 0) {
+				ep = xfs_iext_get_ext(ifp, lastx);
+				if (xfs_bmbt_get_startoff(ep) > bno) {
+					if (--lastx >= 0)
+						ep = xfs_iext_get_ext(ifp,
+								      lastx);
+				}
 				xfs_bmbt_get_all(ep, &got);
+			}
 			extno++;
 		}
 	}
-	ifp->if_lastex = lastx;
 	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191..c62234b 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef	struct xfs_bmap_free
 #define XFS_BMAPI_ENTIRE	0x004	/* return entire extent, not trimmed */
 #define XFS_BMAPI_METADATA	0x008	/* mapping metadata not user data */
 #define XFS_BMAPI_ATTRFORK	0x010	/* use attribute fork not data */
-#define XFS_BMAPI_RSVBLOCKS	0x020	/* OK to alloc. reserved data blocks */
 #define	XFS_BMAPI_PREALLOC	0x040	/* preallocation op: unwritten space */
 #define	XFS_BMAPI_IGSTATE	0x080	/* Ignore state - */
 					/* combine contig. space */
@@ -87,7 +86,6 @@ typedef	struct xfs_bmap_free
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
 	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
-	{ XFS_BMAPI_RSVBLOCKS,	"RSVBLOCKS" }, \
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8e3349..a098a20 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
 	/*
 	 * We know that the size is valid (it's checked in iformat_btree)
 	 */
-	ifp->if_lastex = NULLEXTNUM;
 	ifp->if_bytes = ifp->if_real_bytes = 0;
 	ifp->if_flags |= XFS_IFEXTENTS;
 	xfs_iext_add(ifp, 0, nextents);
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
 	case XFS_DINODE_FMT_EXTENTS:
 		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
 		       !(iip->ili_format.ilf_fields & extflag[whichfork]));
-		ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
-			(ifp->if_bytes == 0));
-		ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
-			(ifp->if_bytes > 0));
 		if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
+			ASSERT(xfs_iext_get_ext(ifp, 0));
 			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
 			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
 				whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
 	xfs_extnum_t	idx)		/* index of target extent */
 {
 	ASSERT(idx >= 0);
+	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
 	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
 		return ifp->if_u1.if_ext_irec->er_extbuf;
 	} else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
 		}
 		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 		ifp->if_real_bytes = 0;
-		ifp->if_lastex = nextents + ext_diff;
 	}
 	/*
 	 * Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
 	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
 
 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	ASSERT(page_idx >= 0 && page_idx <=
-		ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
+	ASSERT(page_idx >= 0);
+	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
 	erp_idx = 0;
 	low = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a3..3ae6d58 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
 	short			if_broot_bytes;	/* bytes allocated for root */
 	unsigned char		if_flags;	/* per-fork flags */
 	unsigned char		if_ext_max;	/* max # of extent records */
-	xfs_extnum_t		if_lastex;	/* last if_extents used */
 	union {
 		xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
 		xfs_ext_irec_t	*if_ext_irec;	/* irec map file exts */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7d56e88..c7755d5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_alloc.h"
+#include "xfs_discard.h"
 
 /*
  * Perform initial CIL structure initialisation. If the CIL is not
@@ -361,18 +362,28 @@ xlog_cil_committed(
 	int	abort)
 {
 	struct xfs_cil_ctx	*ctx = args;
+	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
 
 	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
 					ctx->start_lsn, abort);
 
 	xfs_alloc_busy_sort(&ctx->busy_extents);
-	xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents);
+	xfs_alloc_busy_clear(mp, &ctx->busy_extents,
+			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
 
 	spin_lock(&ctx->cil->xc_cil_lock);
 	list_del(&ctx->committing);
 	spin_unlock(&ctx->cil->xc_cil_lock);
 
 	xlog_cil_free_logvec(ctx->lv_chain);
+
+	if (!list_empty(&ctx->busy_extents)) {
+		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+		xfs_discard_extents(mp, &ctx->busy_extents);
+		xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
+	}
+
 	kmem_free(ctx);
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab..3d68bb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
 						   operations, typically for
 						   disk errors in metadata */
+#define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
 #define XFS_MOUNT_RETERR	(1ULL << 6)     /* return alignment errors to
 						   user */
 #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d1f2485..7c7bc2b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -609,7 +609,7 @@ xfs_trans_free(
 	struct xfs_trans	*tp)
 {
 	xfs_alloc_busy_sort(&tp->t_busy);
-	xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy);
+	xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
 	atomic_dec(&tp->t_mountp->m_active_trans);
 	xfs_trans_free_dqinfo(tp);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f5df235..503c8a6 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -217,8 +217,24 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+				get_block_t get_block);
 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
+/* Convert errno to return value from ->page_mkwrite() call */
+static inline int block_page_mkwrite_return(int err)
+{
+	if (err == 0)
+		return VM_FAULT_LOCKED;
+	if (err == -EFAULT)
+		return VM_FAULT_NOPAGE;
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (err == -EAGAIN)
+		return VM_FAULT_RETRY;
+	/* -ENOSPC, -EDQUOT, -EIO ... */
+	return VM_FAULT_SIGBUS;
+}
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
 int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
new file mode 100644
index 0000000..04ffb2e
--- /dev/null
+++ b/include/linux/cleancache.h
@@ -0,0 +1,122 @@
+#ifndef _LINUX_CLEANCACHE_H
+#define _LINUX_CLEANCACHE_H
+
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+
+#define CLEANCACHE_KEY_MAX 6
+
+/*
+ * cleancache requires every file with a page in cleancache to have a
+ * unique key unless/until the file is removed/truncated.  For some
+ * filesystems, the inode number is unique, but for "modern" filesystems
+ * an exportable filehandle is required (see exportfs.h)
+ */
+struct cleancache_filekey {
+	union {
+		ino_t ino;
+		__u32 fh[CLEANCACHE_KEY_MAX];
+		u32 key[CLEANCACHE_KEY_MAX];
+	} u;
+};
+
+struct cleancache_ops {
+	int (*init_fs)(size_t);
+	int (*init_shared_fs)(char *uuid, size_t);
+	int (*get_page)(int, struct cleancache_filekey,
+			pgoff_t, struct page *);
+	void (*put_page)(int, struct cleancache_filekey,
+			pgoff_t, struct page *);
+	void (*flush_page)(int, struct cleancache_filekey, pgoff_t);
+	void (*flush_inode)(int, struct cleancache_filekey);
+	void (*flush_fs)(int);
+};
+
+extern struct cleancache_ops
+	cleancache_register_ops(struct cleancache_ops *ops);
+extern void __cleancache_init_fs(struct super_block *);
+extern void __cleancache_init_shared_fs(char *, struct super_block *);
+extern int  __cleancache_get_page(struct page *);
+extern void __cleancache_put_page(struct page *);
+extern void __cleancache_flush_page(struct address_space *, struct page *);
+extern void __cleancache_flush_inode(struct address_space *);
+extern void __cleancache_flush_fs(struct super_block *);
+extern int cleancache_enabled;
+
+#ifdef CONFIG_CLEANCACHE
+static inline bool cleancache_fs_enabled(struct page *page)
+{
+	return page->mapping->host->i_sb->cleancache_poolid >= 0;
+}
+static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
+{
+	return mapping->host->i_sb->cleancache_poolid >= 0;
+}
+#else
+#define cleancache_enabled (0)
+#define cleancache_fs_enabled(_page) (0)
+#define cleancache_fs_enabled_mapping(_page) (0)
+#endif
+
+/*
+ * The shim layer provided by these inline functions allows the compiler
+ * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
+ * is disabled, to a single global variable check if CONFIG_CLEANCACHE
+ * is enabled but no cleancache "backend" has dynamically enabled it,
+ * and, for the most frequent cleancache ops, to a single global variable
+ * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
+ * and a cleancache backend has dynamically enabled cleancache, but the
+ * filesystem referenced by that cleancache op has not enabled cleancache.
+ * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
+ * no measurable performance impact.
+ */
+
+static inline void cleancache_init_fs(struct super_block *sb)
+{
+	if (cleancache_enabled)
+		__cleancache_init_fs(sb);
+}
+
+static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+	if (cleancache_enabled)
+		__cleancache_init_shared_fs(uuid, sb);
+}
+
+static inline int cleancache_get_page(struct page *page)
+{
+	int ret = -1;
+
+	if (cleancache_enabled && cleancache_fs_enabled(page))
+		ret = __cleancache_get_page(page);
+	return ret;
+}
+
+static inline void cleancache_put_page(struct page *page)
+{
+	if (cleancache_enabled && cleancache_fs_enabled(page))
+		__cleancache_put_page(page);
+}
+
+static inline void cleancache_flush_page(struct address_space *mapping,
+					struct page *page)
+{
+	/* careful... page->mapping is NULL sometimes when this is called */
+	if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
+		__cleancache_flush_page(mapping, page);
+}
+
+static inline void cleancache_flush_inode(struct address_space *mapping)
+{
+	if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
+		__cleancache_flush_inode(mapping);
+}
+
+static inline void cleancache_flush_fs(struct super_block *sb)
+{
+	if (cleancache_enabled)
+		__cleancache_flush_fs(sb);
+}
+
+#endif /* _LINUX_CLEANCACHE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f9d325..2416093 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1428,6 +1428,11 @@ struct super_block {
 	 */
 	char __rcu *s_options;
 	const struct dentry_operations *s_d_op; /* default d_op for dentries */
+
+	/*
+	 * Saved pool identifier for cleancache (-1 means none)
+	 */
+	int cleancache_poolid;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 943c76b..59225ef 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_HUGETLB_H
 #define _LINUX_HUGETLB_H
 
+#include <linux/mm_types.h>
 #include <linux/fs.h>
 #include <linux/hugetlb_inline.h>
 
@@ -41,7 +42,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
-						int acctflags);
+						vm_flags_t vm_flags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 int dequeue_hwpoisoned_huge_page(struct page *page);
 void copy_huge_page(struct page *dst, struct page *src);
@@ -168,7 +169,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t size, int acct,
+struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
@@ -192,7 +193,7 @@ static inline void set_file_hugepages(struct file *file)
 #define is_file_hugepages(file)			0
 #define set_file_hugepages(file)		BUG()
 static inline struct file *hugetlb_file_setup(const char *name, size_t size,
-		int acctflag, struct user_struct **user, int creat_flags)
+		vm_flags_t acctflag, struct user_struct **user, int creat_flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 6931489..2bb681f 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-	return vma->vm_flags & VM_HUGETLB;
+	return !!(vma->vm_flags & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a32dcae..4ecb7b1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -529,9 +529,10 @@ struct transaction_s
 	enum {
 		T_RUNNING,
 		T_LOCKED,
-		T_RUNDOWN,
 		T_FLUSH,
 		T_COMMIT,
+		T_COMMIT_DFLUSH,
+		T_COMMIT_JFLUSH,
 		T_FINISHED
 	}			t_state;
 
@@ -658,7 +659,9 @@ struct transaction_s
 	 * waiting for it to finish.
 	 */
 	unsigned int t_synchronous_commit:1;
-	unsigned int t_flushed_data_blocks:1;
+
+	/* Disk flush needs to be sent to fs partition [no locking] */
+	int			t_need_data_flush;
 
 	/*
 	 * For use by the filesystem to store fs-specific data
@@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_journal_force_commit_nested(journal_t *journal);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
 
 void __jbd2_log_wait_for_space(journal_t *journal);
 extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8eb969e..fb8e814 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -165,12 +165,12 @@ extern pgprot_t protection_map[16];
  */
 static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
 {
-	return (vma->vm_flags & VM_PFN_AT_MMAP);
+	return !!(vma->vm_flags & VM_PFN_AT_MMAP);
 }
 
 static inline int is_pfn_mapping(struct vm_area_struct *vma)
 {
-	return (vma->vm_flags & VM_PFNMAP);
+	return !!(vma->vm_flags & VM_PFNMAP);
 }
 
 /*
@@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long flag, unsigned long pgoff);
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
-	unsigned int vm_flags, unsigned long pgoff);
+	vm_flags_t vm_flags, unsigned long pgoff);
 
 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 071d459..6fe96c1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -102,6 +102,8 @@ struct page {
 #endif
 };
 
+typedef unsigned long __nocast vm_flags_t;
+
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
  * conditions.  These are held in a global tree and are pinned by the VMAs that
@@ -109,7 +111,7 @@ struct page {
  */
 struct vm_region {
 	struct rb_node	vm_rb;		/* link in global region tree */
-	unsigned long	vm_flags;	/* VMA vm_flags */
+	vm_flags_t	vm_flags;	/* VMA vm_flags */
 	unsigned long	vm_start;	/* start address of region */
 	unsigned long	vm_end;		/* region initialised to here */
 	unsigned long	vm_top;		/* region allocated to here */
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 4c4ac3f..a9dd895 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -24,6 +24,7 @@
 /* leave room for NETLINK_DM (DM Events) */
 #define NETLINK_SCSITRANSPORT	18	/* SCSI Transports */
 #define NETLINK_ECRYPTFS	19
+#define NETLINK_RDMA		20
 
 #define MAX_LINKS 32		
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 74243c8..7ad824d 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -98,16 +98,6 @@ void ipi_call_unlock_irq(void);
  */
 int on_each_cpu(smp_call_func_t func, void *info, int wait);
 
-#define MSG_ALL_BUT_SELF	0x8000	/* Assume <32768 CPU's */
-#define MSG_ALL			0x8001
-
-#define MSG_INVALIDATE_TLB	0x0001	/* Remote processor TLB invalidate */
-#define MSG_STOP_CPU		0x0002	/* Sent to shut down slave CPU's
-					 * when rebooting
-					 */
-#define MSG_RESCHEDULE		0x0003	/* Reschedule request from master CPU*/
-#define MSG_CALL_FUNCTION       0x0004  /* Call function on all other CPUs */
-
 /*
  * Mark the boot cpu "online" so that it can call console drivers in
  * printk() and can access its per-cpu storage.
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index b4d7710..bb4f5fb 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -581,7 +581,7 @@ extern int spi_bus_unlock(struct spi_master *master);
  * Callable only from contexts that can sleep.
  */
 static inline int
-spi_write(struct spi_device *spi, const u8 *buf, size_t len)
+spi_write(struct spi_device *spi, const void *buf, size_t len)
 {
 	struct spi_transfer	t = {
 			.tx_buf		= buf,
@@ -605,7 +605,7 @@ spi_write(struct spi_device *spi, const u8 *buf, size_t len)
  * Callable only from contexts that can sleep.
  */
 static inline int
-spi_read(struct spi_device *spi, u8 *buf, size_t len)
+spi_read(struct spi_device *spi, void *buf, size_t len)
 {
 	struct spi_transfer	t = {
 			.rx_buf		= buf,
@@ -620,8 +620,8 @@ spi_read(struct spi_device *spi, u8 *buf, size_t len)
 
 /* this copies txbuf and rxbuf data; for small transfers only! */
 extern int spi_write_then_read(struct spi_device *spi,
-		const u8 *txbuf, unsigned n_tx,
-		u8 *rxbuf, unsigned n_rx);
+		const void *txbuf, unsigned n_tx,
+		void *rxbuf, unsigned n_rx);
 
 /**
  * spi_w8r8 - SPI synchronous 8 bit write followed by 8 bit read
diff --git a/include/rdma/Kbuild b/include/rdma/Kbuild
index e7c0432..ea56f76 100644
--- a/include/rdma/Kbuild
+++ b/include/rdma/Kbuild
@@ -1 +1,6 @@
+header-y += ib_user_cm.h
 header-y += ib_user_mad.h
+header-y += ib_user_sa.h
+header-y += ib_user_verbs.h
+header-y += rdma_netlink.h
+header-y += rdma_user_cm.h
diff --git a/include/rdma/ib_user_cm.h b/include/rdma/ib_user_cm.h
index bd3d380..f79014a 100644
--- a/include/rdma/ib_user_cm.h
+++ b/include/rdma/ib_user_cm.h
@@ -34,6 +34,7 @@
 #ifndef IB_USER_CM_H
 #define IB_USER_CM_H
 
+#include <linux/types.h>
 #include <rdma/ib_user_sa.h>
 
 #define IB_USER_CM_ABI_VERSION 5
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 169f7a5..26977c1 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -111,6 +111,20 @@ struct rdma_cm_event {
 	} param;
 };
 
+enum rdma_cm_state {
+	RDMA_CM_IDLE,
+	RDMA_CM_ADDR_QUERY,
+	RDMA_CM_ADDR_RESOLVED,
+	RDMA_CM_ROUTE_QUERY,
+	RDMA_CM_ROUTE_RESOLVED,
+	RDMA_CM_CONNECT,
+	RDMA_CM_DISCONNECT,
+	RDMA_CM_ADDR_BOUND,
+	RDMA_CM_LISTEN,
+	RDMA_CM_DEVICE_REMOVAL,
+	RDMA_CM_DESTROYING
+};
+
 struct rdma_cm_id;
 
 /**
@@ -130,6 +144,7 @@ struct rdma_cm_id {
 	rdma_cm_event_handler	 event_handler;
 	struct rdma_route	 route;
 	enum rdma_port_space	 ps;
+	enum ib_qp_type		 qp_type;
 	u8			 port_num;
 };
 
@@ -140,9 +155,11 @@ struct rdma_cm_id {
  *   returned rdma_id.
  * @context: User specified context associated with the id.
  * @ps: RDMA port space.
+ * @qp_type: type of queue pair associated with the id.
  */
 struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
-				  void *context, enum rdma_port_space ps);
+				  void *context, enum rdma_port_space ps,
+				  enum ib_qp_type qp_type);
 
 /**
   * rdma_destroy_id - Destroys an RDMA identifier.
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
new file mode 100644
index 0000000..3c5363a
--- /dev/null
+++ b/include/rdma/rdma_netlink.h
@@ -0,0 +1,92 @@
+#ifndef _RDMA_NETLINK_H
+#define _RDMA_NETLINK_H
+
+#include <linux/types.h>
+
+enum {
+	RDMA_NL_RDMA_CM = 1
+};
+
+#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10)
+#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1))
+#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op)
+
+enum {
+	RDMA_NL_RDMA_CM_ID_STATS = 0,
+	RDMA_NL_RDMA_CM_NUM_OPS
+};
+
+enum {
+	RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1,
+	RDMA_NL_RDMA_CM_ATTR_DST_ADDR,
+	RDMA_NL_RDMA_CM_NUM_ATTR,
+};
+
+struct rdma_cm_id_stats {
+	__u32	qp_num;
+	__u32	bound_dev_if;
+	__u32	port_space;
+	__s32	pid;
+	__u8	cm_state;
+	__u8	node_type;
+	__u8	port_num;
+	__u8	qp_type;
+};
+
+#ifdef __KERNEL__
+
+#include <linux/netlink.h>
+
+struct ibnl_client_cbs {
+	int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb);
+};
+
+int ibnl_init(void);
+void ibnl_cleanup(void);
+
+/**
+ * Add a a client to the list of IB netlink exporters.
+ * @index: Index of the added client
+ * @nops: Number of supported ops by the added client.
+ * @cb_table: A table for op->callback
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ibnl_add_client(int index, int nops,
+		    const struct ibnl_client_cbs cb_table[]);
+
+/**
+ * Remove a client from IB netlink.
+ * @index: Index of the removed IB client.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ibnl_remove_client(int index);
+
+/**
+ * Put a new message in a supplied skb.
+ * @skb: The netlink skb.
+ * @nlh: Pointer to put the header of the new netlink message.
+ * @seq: The message sequence number.
+ * @len: The requested message length to allocate.
+ * @client: Calling IB netlink client.
+ * @op: message content op.
+ * Returns the allocated buffer on success and NULL on failure.
+ */
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+		   int len, int client, int op);
+/**
+ * Put a new attribute in a supplied skb.
+ * @skb: The netlink skb.
+ * @nlh: Header of the netlink message to append the attribute to.
+ * @len: The length of the attribute data.
+ * @data: The attribute data to put.
+ * @type: The attribute type.
+ * Returns the 0 and a negative error code on failure.
+ */
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+		  int len, void *data, int type);
+
+#endif /* __KERNEL__ */
+
+#endif /* _RDMA_NETLINK_H */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index b33257b..70213b4 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -58,6 +58,7 @@
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
 #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_tmem_op              38
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -461,6 +462,27 @@ typedef uint8_t xen_domain_handle_t[16];
 #define __mk_unsigned_long(x) x ## UL
 #define mk_unsigned_long(x) __mk_unsigned_long(x)
 
+#define TMEM_SPEC_VERSION 1
+
+struct tmem_op {
+	uint32_t cmd;
+	int32_t pool_id;
+	union {
+		struct {  /* for cmd == TMEM_NEW_POOL */
+			uint64_t uuid[2];
+			uint32_t flags;
+		} new;
+		struct {
+			uint64_t oid[3];
+			uint32_t index;
+			uint32_t tmem_offset;
+			uint32_t pfn_offset;
+			uint32_t len;
+			GUEST_HANDLE(void) gmfn; /* guest machine page frame */
+		} gen;
+	} u;
+};
+
 #else /* __ASSEMBLY__ */
 
 /* In assembly code we cannot use C numeric constant suffixes. */
diff --git a/ipc/shm.c b/ipc/shm.c
index 729acb7..ab3385a 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -347,7 +347,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	struct file * file;
 	char name[13];
 	int id;
-	int acctflag = 0;
+	vm_flags_t acctflag = 0;
 
 	if (size < SHMMIN || size > ns->shm_ctlmax)
 		return -EINVAL;
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61..8ca47a5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
 	depends on !SMP
 	bool
 	default y
+
+config CLEANCACHE
+	bool "Enable cleancache driver to cache clean pages if tmem is present"
+	default n
+	help
+	  Cleancache can be thought of as a page-granularity victim cache
+	  for clean pages that the kernel's pageframe replacement algorithm
+	  (PFRA) would like to keep around, but can't since there isn't enough
+	  memory.  So when the PFRA "evicts" a page, it first attempts to use
+	  cleancacne code to put the data contained in that page into
+	  "transcendent memory", memory that is not directly accessible or
+	  addressable by the kernel and is of unknown and possibly
+	  time-varying size.  And when a cleancache-enabled
+	  filesystem wishes to access a page in a file on disk, it first
+	  checks cleancache to see if it already contains it; if it does,
+	  the page is copied into the kernel and a disk access is avoided.
+	  When a transcendent memory driver is available (such as zcache or
+	  Xen transcendent memory), a significant I/O reduction
+	  may be achieved.  When none is available, all cleancache calls
+	  are reduced to a single pointer-compare-against-NULL resulting
+	  in a negligible performance hit.
+
+	  If unsure, say Y to enable cleancache
diff --git a/mm/Makefile b/mm/Makefile
index 42a8326..836e416 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 0000000..bcaae4c
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache.  See
+ * Documentation/vm/cleancache.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/cleancache.h>
+
+/*
+ * This global enablement flag may be read thousands of times per second
+ * by cleancache_get/put/flush even on systems where cleancache_ops
+ * is not claimed (e.g. cleancache is config'ed on but remains
+ * disabled), so is preferred to the slower alternative: a function
+ * call that checks a non-global.
+ */
+int cleancache_enabled;
+EXPORT_SYMBOL(cleancache_enabled);
+
+/*
+ * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static struct cleancache_ops cleancache_ops;
+
+/* useful stats available in /sys/kernel/mm/cleancache */
+static unsigned long cleancache_succ_gets;
+static unsigned long cleancache_failed_gets;
+static unsigned long cleancache_puts;
+static unsigned long cleancache_flushes;
+
+/*
+ * register operations for cleancache, returning previous thus allowing
+ * detection of multiple backends and possible nesting
+ */
+struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
+{
+	struct cleancache_ops old = cleancache_ops;
+
+	cleancache_ops = *ops;
+	cleancache_enabled = 1;
+	return old;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+	sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+	sb->cleancache_poolid =
+		(*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+			      struct cleancache_filekey *key)
+{
+	int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+	int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+	struct super_block *sb = inode->i_sb;
+
+	key->u.ino = inode->i_ino;
+	if (sb->s_export_op != NULL) {
+		fhfn = sb->s_export_op->encode_fh;
+		if  (fhfn) {
+			struct dentry d;
+			d.d_inode = inode;
+			len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+			if (len <= 0 || len == 255)
+				return -1;
+			if (maxlen > CLEANCACHE_KEY_MAX)
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleanache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * The pageframe is unchanged and returns -1 if the get fails.
+ * Page must be locked by caller.
+ */
+int __cleancache_get_page(struct page *page)
+{
+	int ret = -1;
+	int pool_id;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	VM_BUG_ON(!PageLocked(page));
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
+	if (pool_id < 0)
+		goto out;
+
+	if (cleancache_get_key(page->mapping->host, &key) < 0)
+		goto out;
+
+	ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
+	if (ret == 0)
+		cleancache_succ_gets++;
+	else
+		cleancache_failed_gets++;
+out:
+	return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's,
+ * inode and page index.  Page must be locked.  Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ */
+void __cleancache_put_page(struct page *page)
+{
+	int pool_id;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	VM_BUG_ON(!PageLocked(page));
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
+	if (pool_id >= 0 &&
+	      cleancache_get_key(page->mapping->host, &key) >= 0) {
+		(*cleancache_ops.put_page)(pool_id, key, page->index, page);
+		cleancache_puts++;
+	}
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Flush any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ */
+void __cleancache_flush_page(struct address_space *mapping, struct page *page)
+{
+	/* careful... page->mapping is NULL sometimes when this is called */
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	if (pool_id >= 0) {
+		VM_BUG_ON(!PageLocked(page));
+		if (cleancache_get_key(mapping->host, &key) >= 0) {
+			(*cleancache_ops.flush_page)(pool_id, key, page->index);
+			cleancache_flushes++;
+		}
+	}
+}
+EXPORT_SYMBOL(__cleancache_flush_page);
+
+/*
+ * Flush all data from cleancache associated with the poolid and the
+ * mappings's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ */
+void __cleancache_flush_inode(struct address_space *mapping)
+{
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+		(*cleancache_ops.flush_inode)(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_flush_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be reutrned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs
+ */
+void __cleancache_flush_fs(struct super_block *sb)
+{
+	if (sb->cleancache_poolid >= 0) {
+		int old_poolid = sb->cleancache_poolid;
+		sb->cleancache_poolid = -1;
+		(*cleancache_ops.flush_fs)(old_poolid);
+	}
+}
+EXPORT_SYMBOL(__cleancache_flush_fs);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
+
+#define CLEANCACHE_SYSFS_RO(_name) \
+	static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+		return sprintf(buf, "%lu\n", cleancache_##_name); \
+	} \
+	static struct kobj_attribute cleancache_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = cleancache_##_name##_show, \
+	}
+
+CLEANCACHE_SYSFS_RO(succ_gets);
+CLEANCACHE_SYSFS_RO(failed_gets);
+CLEANCACHE_SYSFS_RO(puts);
+CLEANCACHE_SYSFS_RO(flushes);
+
+static struct attribute *cleancache_attrs[] = {
+	&cleancache_succ_gets_attr.attr,
+	&cleancache_failed_gets_attr.attr,
+	&cleancache_puts_attr.attr,
+	&cleancache_flushes_attr.attr,
+	NULL,
+};
+
+static struct attribute_group cleancache_attr_group = {
+	.attrs = cleancache_attrs,
+	.name = "cleancache",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_cleancache(void)
+{
+#ifdef CONFIG_SYSFS
+	int err;
+
+	err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
+#endif /* CONFIG_SYSFS */
+	return 0;
+}
+module_init(init_cleancache)
diff --git a/mm/filemap.c b/mm/filemap.c
index 68e782b..7455ccd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 /*
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	/*
+	 * if we're uptodate, flush out into the cleancache, otherwise
+	 * invalidate any existing cleancache entries.  We can't leave
+	 * stale data around in the cleancache once our page is gone
+	 */
+	if (PageUptodate(page) && PageMappedToDisk(page))
+		cleancache_put_page(page);
+	else
+		cleancache_flush_page(mapping, page);
+
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
diff --git a/mm/fremap.c b/mm/fremap.c
index 7f41230..b8e0e2d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		/*
 		 * drop PG_Mlocked flag for over-mapped range
 		 */
-		unsigned int saved_flags = vma->vm_flags;
+		vm_flags_t saved_flags = vma->vm_flags;
 		munlock_vma_pages_range(vma, start, start + size);
 		vma->vm_flags = saved_flags;
 	}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5fd68b9..f33bb31 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 int hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
 					struct vm_area_struct *vma,
-					int acctflag)
+					vm_flags_t vm_flags)
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * attempt will be made for VM_NORESERVE to allocate a page
 	 * and filesystem quota without using reserves
 	 */
-	if (acctflag & VM_NORESERVE)
+	if (vm_flags & VM_NORESERVE)
 		return 0;
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index b73f677..fc24f7d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	add_taint(TAINT_BAD_PAGE);
 }
 
-static inline int is_cow_mapping(unsigned int flags)
+static inline int is_cow_mapping(vm_flags_t flags)
 {
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index 516b2c2..048260c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
  * For vmas that pass the filters, merge/split as appropriate.
  */
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
-	unsigned long start, unsigned long end, unsigned int newflags)
+	unsigned long start, unsigned long end, vm_flags_t newflags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
 	int nr_pages;
 	int ret = 0;
-	int lock = newflags & VM_LOCKED;
+	int lock = !!(newflags & VM_LOCKED);
 
 	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
 	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
 		prev = vma;
 
 	for (nstart = start ; ; ) {
-		unsigned int newflags;
+		vm_flags_t newflags;
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
 		goto out;
 
 	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
-		unsigned int newflags;
+		vm_flags_t newflags;
 
 		newflags = vma->vm_flags | VM_LOCKED;
 		if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index ac2631b..bbdc9af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
-	unsigned int vm_flags;
+	vm_flags_t vm_flags;
 	int error;
 	unsigned long reqprot = prot;
 
@@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  */
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
-	unsigned int vm_flags = vma->vm_flags;
+	vm_flags_t vm_flags = vma->vm_flags;
 
 	/* If it was private or non-writable, the write bit is already clear */
 	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
  * We account for memory if it's a private writeable mapping,
  * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 {
 	/*
 	 * hugetlb has its own accounting separate from the core VM
@@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  unsigned int vm_flags, unsigned long pgoff)
+			  vm_flags_t vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
diff --git a/mm/truncate.c b/mm/truncate.c
index a956675..3a29a61 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+	cleancache_flush_page(page->mapping, page);
 	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t next;
 	int i;
 
+	cleancache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 	}
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
+	cleancache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
+	cleancache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 844a7a5..159c50f 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -589,7 +589,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 		return -ENOMEM;
 
 	/* Create the RDMA CM ID */
-	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP);
+	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
+				     IB_QPT_RC);
 	if (IS_ERR(rdma->cm_id))
 		goto error;
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index cce19f95..3b83086 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -325,7 +325,7 @@ static int rds_ib_laddr_check(__be32 addr)
 	/* Create a CMA ID and try to bind it. This catches both
 	 * IB and iWARP capable NICs.
 	 */
-	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(cm_id))
 		return PTR_ERR(cm_id);
 
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index ee369d2..fd453dd 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -587,7 +587,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
 	/* XXX I wonder what affect the port space has */
 	/* delegate cm event handler to rdma_transport */
 	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
-				     RDMA_PS_TCP);
+				     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(ic->i_cm_id)) {
 		ret = PTR_ERR(ic->i_cm_id);
 		ic->i_cm_id = NULL;
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 5a9676f..f747484 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -226,7 +226,7 @@ static int rds_iw_laddr_check(__be32 addr)
 	/* Create a CMA ID and try to bind it. This catches both
 	 * IB and iWARP capable NICs.
 	 */
-	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(cm_id))
 		return PTR_ERR(cm_id);
 
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index 3a60a15..c12db66 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -522,7 +522,7 @@ int rds_iw_conn_connect(struct rds_connection *conn)
 	/* XXX I wonder what affect the port space has */
 	/* delegate cm event handler to rdma_transport */
 	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
-				     RDMA_PS_TCP);
+				     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(ic->i_cm_id)) {
 		ret = PTR_ERR(ic->i_cm_id);
 		ic->i_cm_id = NULL;
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 4195a05..f8760e1 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -158,7 +158,8 @@ static int rds_rdma_listen_init(void)
 	struct rdma_cm_id *cm_id;
 	int ret;
 
-	cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
+	cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
+			       IB_QPT_RC);
 	if (IS_ERR(cm_id)) {
 		ret = PTR_ERR(cm_id);
 		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 6c014dd..c3c232a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -695,7 +695,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 		return ERR_PTR(-ENOMEM);
 	xprt = &cma_xprt->sc_xprt;
 
-	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
+	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
+				   IB_QPT_RC);
 	if (IS_ERR(listen_id)) {
 		ret = PTR_ERR(listen_id);
 		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index d4297dc..80f8da3 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -387,7 +387,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 
 	init_completion(&ia->ri_done);
 
-	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
+	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(id)) {
 		rc = PTR_ERR(id);
 		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
diff --git a/sound/soc/omap/Kconfig b/sound/soc/omap/Kconfig
index b592298..99054cf 100644
--- a/sound/soc/omap/Kconfig
+++ b/sound/soc/omap/Kconfig
@@ -65,14 +65,6 @@ config SND_OMAP_SOC_OVERO
 	  Say Y if you want to add support for SoC audio on the
 	  Gumstix Overo or CompuLab CM-T35
 
-config SND_OMAP_SOC_OMAP2EVM
-	tristate "SoC Audio support for OMAP2EVM board"
-	depends on TWL4030_CORE && SND_OMAP_SOC && MACH_OMAP2EVM
-	select SND_OMAP_SOC_MCBSP
-	select SND_SOC_TWL4030
-	help
-	  Say Y if you want to add support for SoC audio on the omap2evm board.
-
 config SND_OMAP_SOC_OMAP3EVM
 	tristate "SoC Audio support for OMAP3EVM board"
 	depends on TWL4030_CORE && SND_OMAP_SOC && MACH_OMAP3EVM
diff --git a/sound/soc/omap/Makefile b/sound/soc/omap/Makefile
index ba9fc65..6c2c87e 100644
--- a/sound/soc/omap/Makefile
+++ b/sound/soc/omap/Makefile
@@ -13,7 +13,6 @@ snd-soc-rx51-objs := rx51.o
 snd-soc-ams-delta-objs := ams-delta.o
 snd-soc-osk5912-objs := osk5912.o
 snd-soc-overo-objs := overo.o
-snd-soc-omap2evm-objs := omap2evm.o
 snd-soc-omap3evm-objs := omap3evm.o
 snd-soc-am3517evm-objs := am3517evm.o
 snd-soc-sdp3430-objs := sdp3430.o
diff --git a/sound/soc/omap/omap2evm.c b/sound/soc/omap/omap2evm.c
deleted file mode 100644
index 29b60d6..0000000
--- a/sound/soc/omap/omap2evm.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * omap2evm.c  --  SoC audio machine driver for omap2evm board
- *
- * Author: Arun KS <arunks@mistralsolutions.com>
- *
- * Based on sound/soc/omap/overo.c by Steve Sakoman
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- *
- */
-
-#include <linux/clk.h>
-#include <linux/platform_device.h>
-#include <sound/core.h>
-#include <sound/pcm.h>
-#include <sound/soc.h>
-
-#include <asm/mach-types.h>
-#include <mach/hardware.h>
-#include <mach/gpio.h>
-#include <plat/mcbsp.h>
-
-#include "omap-mcbsp.h"
-#include "omap-pcm.h"
-
-static int omap2evm_hw_params(struct snd_pcm_substream *substream,
-	struct snd_pcm_hw_params *params)
-{
-	struct snd_soc_pcm_runtime *rtd = substream->private_data;
-	struct snd_soc_dai *codec_dai = rtd->codec_dai;
-	struct snd_soc_dai *cpu_dai = rtd->cpu_dai;
-	int ret;
-
-	/* Set codec DAI configuration */
-	ret = snd_soc_dai_set_fmt(codec_dai,
-				  SND_SOC_DAIFMT_I2S |
-				  SND_SOC_DAIFMT_NB_NF |
-				  SND_SOC_DAIFMT_CBM_CFM);
-	if (ret < 0) {
-		printk(KERN_ERR "can't set codec DAI configuration\n");
-		return ret;
-	}
-
-	/* Set cpu DAI configuration */
-	ret = snd_soc_dai_set_fmt(cpu_dai,
-				  SND_SOC_DAIFMT_I2S |
-				  SND_SOC_DAIFMT_NB_NF |
-				  SND_SOC_DAIFMT_CBM_CFM);
-	if (ret < 0) {
-		printk(KERN_ERR "can't set cpu DAI configuration\n");
-		return ret;
-	}
-
-	/* Set the codec system clock for DAC and ADC */
-	ret = snd_soc_dai_set_sysclk(codec_dai, 0, 26000000,
-					    SND_SOC_CLOCK_IN);
-	if (ret < 0) {
-		printk(KERN_ERR "can't set codec system clock\n");
-		return ret;
-	}
-
-	return 0;
-}
-
-static struct snd_soc_ops omap2evm_ops = {
-	.hw_params = omap2evm_hw_params,
-};
-
-/* Digital audio interface glue - connects codec <--> CPU */
-static struct snd_soc_dai_link omap2evm_dai = {
-	.name = "TWL4030",
-	.stream_name = "TWL4030",
-	.cpu_dai_name = "omap-mcbsp-dai.1",
-	.codec_dai_name = "twl4030-hifi",
-	.platform_name = "omap-pcm-audio",
-	.codec_name = "twl4030-codec",
-	.ops = &omap2evm_ops,
-};
-
-/* Audio machine driver */
-static struct snd_soc_card snd_soc_omap2evm = {
-	.name = "omap2evm",
-	.dai_link = &omap2evm_dai,
-	.num_links = 1,
-};
-
-static struct platform_device *omap2evm_snd_device;
-
-static int __init omap2evm_soc_init(void)
-{
-	int ret;
-
-	if (!machine_is_omap2evm())
-		return -ENODEV;
-	printk(KERN_INFO "omap2evm SoC init\n");
-
-	omap2evm_snd_device = platform_device_alloc("soc-audio", -1);
-	if (!omap2evm_snd_device) {
-		printk(KERN_ERR "Platform device allocation failed\n");
-		return -ENOMEM;
-	}
-
-	platform_set_drvdata(omap2evm_snd_device, &snd_soc_omap2evm);
-
-	ret = platform_device_add(omap2evm_snd_device);
-	if (ret)
-		goto err1;
-
-	return 0;
-
-err1:
-	printk(KERN_ERR "Unable to add platform device\n");
-	platform_device_put(omap2evm_snd_device);
-
-	return ret;
-}
-module_init(omap2evm_soc_init);
-
-static void __exit omap2evm_soc_exit(void)
-{
-	platform_device_unregister(omap2evm_snd_device);
-}
-module_exit(omap2evm_soc_exit);
-
-MODULE_AUTHOR("Arun KS <arunks@mistralsolutions.com>");
-MODULE_DESCRIPTION("ALSA SoC omap2evm");
-MODULE_LICENSE("GPL");