157 files changed, 8708 insertions, 2893 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index caa7c3a..6bc2d87 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8373,9 +8373,14 @@ M:	Chris Metcalf <cmetcalf@tilera.com>
 W:	http://www.tilera.com/scm/
 S:	Supported
 F:	arch/tile/
-F:	drivers/tty/hvc/hvc_tile.c
-F:	drivers/net/ethernet/tile/
+F:	drivers/char/tile-srom.c
 F:	drivers/edac/tile_edac.c
+F:	drivers/net/ethernet/tile/
+F:	drivers/rtc/rtc-tile.c
+F:	drivers/tty/hvc/hvc_tile.c
+F:	drivers/tty/serial/tilegx.c
+F:	drivers/usb/host/*-tilegx.c
+F:	include/linux/usb/tilegx.h
 
 TLAN NETWORK DRIVER
 M:	Samuel Chessman <chessman@tux.org>
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 24565a7..6e1ed55 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -26,6 +26,7 @@ config TILE
 	select HAVE_SYSCALL_TRACEPOINTS
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select HAVE_DEBUG_STACKOVERFLOW
+	select ARCH_WANT_FRAME_POINTERS
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
@@ -64,6 +65,9 @@ config HUGETLB_SUPER_PAGES
 	depends on HUGETLB_PAGE && TILEGX
 	def_bool y
 
+config GENERIC_TIME_VSYSCALL
+	def_bool y
+
 # FIXME: tilegx can implement a more efficient rwsem.
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
@@ -112,10 +116,19 @@ config SMP
 config HVC_TILE
 	depends on TTY
 	select HVC_DRIVER
+	select HVC_IRQ if TILEGX
 	def_bool y
 
 config TILEGX
-	bool "Building with TILE-Gx (64-bit) compiler and toolchain"
+	bool "Building for TILE-Gx (64-bit) processor"
+	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_KPROBES
+	select HAVE_KRETPROBES
+	select HAVE_ARCH_KGDB
 
 config TILEPRO
 	def_bool !TILEGX
@@ -194,7 +207,7 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
-# We do not currently support disabling HIGHMEM on tile64 and tilepro.
+# We do not currently support disabling HIGHMEM on tilepro.
 config HIGHMEM
 	bool # "Support for more than 512 MB of RAM"
 	default !TILEGX
@@ -300,6 +313,8 @@ config PAGE_OFFSET
 
 source "mm/Kconfig"
 
+source "kernel/Kconfig.preempt"
+
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
 	default n
@@ -396,8 +411,20 @@ config NO_IOMEM
 config NO_IOPORT
 	def_bool !PCI
 
+config TILE_PCI_IO
+	bool "PCI I/O space support"
+	default n
+	depends on PCI
+	depends on TILEGX
+	---help---
+	  Enable PCI I/O space support on TILEGx. Since the PCI I/O space
+	  is used by few modern PCIe endpoint devices, its support is disabled
+	  by default to save the TRIO PIO Region resource for other purposes.
+
 source "drivers/pci/Kconfig"
 
+source "drivers/pci/pcie/Kconfig"
+
 config TILE_USB
 	tristate "Tilera USB host adapter support"
 	default y
diff --git a/arch/tile/Kconfig.debug b/arch/tile/Kconfig.debug
index 9165ea9..19734d3 100644
--- a/arch/tile/Kconfig.debug
+++ b/arch/tile/Kconfig.debug
@@ -14,14 +14,12 @@ config EARLY_PRINTK
 	  with klogd/syslogd. You should normally N here,
 	  unless you want to debug such a crash.
 
-config DEBUG_EXTRA_FLAGS
-	string "Additional compiler arguments when building with '-g'"
-	depends on DEBUG_INFO
-	default ""
+config TILE_HVGLUE_TRACE
+	bool "Provide wrapper functions for hypervisor ABI calls"
+	default n
 	help
-	  Debug info can be large, and flags like
-	  `-femit-struct-debug-baseonly' can reduce the kernel file
-	  size and build time noticeably.  Such flags are often
-	  helpful if the main use of debug info is line number info.
+	  Provide wrapper functions for the hypervisor ABI calls
+	  defined in arch/tile/kernel/hvglue.S.  This allows tracing
+	  mechanisms, etc., to have visibility into those calls.
 
 endmenu
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364..4dc380a 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -30,10 +30,6 @@ endif
 # In kernel modules, this causes load failures due to unsupported relocations.
 KBUILD_CFLAGS   += -fno-asynchronous-unwind-tables
 
-ifneq ($(CONFIG_DEBUG_EXTRA_FLAGS),"")
-KBUILD_CFLAGS   += $(CONFIG_DEBUG_EXTRA_FLAGS)
-endif
-
 LIBGCC_PATH     := \
   $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name)
 
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index 4768481..730e40d 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -1,16 +1,15 @@
 CONFIG_TILEGX=y
-CONFIG_EXPERIMENTAL=y
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_FHANDLE=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=19
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_DEBUG=y
@@ -18,18 +17,18 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEMCG=y
-CONFIG_CGROUP_MEMCG_SWAP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_BLK_CGROUP=y
 CONFIG_NAMESPACES=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_RD_XZ=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
+CONFIG_KPROBES=y
 CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
@@ -45,12 +44,12 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_SGI_PARTITION=y
 CONFIG_SUN_PARTITION=y
 CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_NR_CPUS=100
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_HZ_100=y
+# CONFIG_COMPACTION is not set
+CONFIG_PREEMPT_VOLUNTARY=y
+CONFIG_TILE_PCI_IO=y
 CONFIG_PCI_DEBUG=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=y
@@ -108,150 +107,9 @@ CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_MROUTE=y
 CONFIG_IPV6_PIMSM_V2=y
 CONFIG_NETLABEL=y
-CONFIG_NETFILTER=y
-CONFIG_NF_CONNTRACK=m
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_ZONES=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CT_PROTO_DCCP=m
-CONFIG_NF_CT_PROTO_UDPLITE=m
-CONFIG_NF_CONNTRACK_AMANDA=m
-CONFIG_NF_CONNTRACK_FTP=m
-CONFIG_NF_CONNTRACK_H323=m
-CONFIG_NF_CONNTRACK_IRC=m
-CONFIG_NF_CONNTRACK_NETBIOS_NS=m
-CONFIG_NF_CONNTRACK_PPTP=m
-CONFIG_NF_CONNTRACK_SANE=m
-CONFIG_NF_CONNTRACK_SIP=m
-CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NETFILTER_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
-CONFIG_NETFILTER_XT_TARGET_DSCP=m
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
-CONFIG_NETFILTER_XT_TARGET_MARK=m
-CONFIG_NETFILTER_XT_TARGET_NFLOG=m
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
-CONFIG_NETFILTER_XT_TARGET_TEE=m
-CONFIG_NETFILTER_XT_TARGET_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_TRACE=m
-CONFIG_NETFILTER_XT_TARGET_SECMARK=m
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
-CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
-CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
-CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
-CONFIG_NETFILTER_XT_MATCH_DSCP=m
-CONFIG_NETFILTER_XT_MATCH_ESP=m
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_HELPER=m
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
-CONFIG_NETFILTER_XT_MATCH_IPVS=m
-CONFIG_NETFILTER_XT_MATCH_LENGTH=m
-CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-CONFIG_NETFILTER_XT_MATCH_MAC=m
-CONFIG_NETFILTER_XT_MATCH_MARK=m
-CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
-CONFIG_NETFILTER_XT_MATCH_OSF=m
-CONFIG_NETFILTER_XT_MATCH_OWNER=m
-CONFIG_NETFILTER_XT_MATCH_POLICY=m
-CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
-CONFIG_NETFILTER_XT_MATCH_QUOTA=m
-CONFIG_NETFILTER_XT_MATCH_RATEEST=m
-CONFIG_NETFILTER_XT_MATCH_REALM=m
-CONFIG_NETFILTER_XT_MATCH_RECENT=m
-CONFIG_NETFILTER_XT_MATCH_SOCKET=m
-CONFIG_NETFILTER_XT_MATCH_STATE=m
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
-CONFIG_NETFILTER_XT_MATCH_STRING=m
-CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
-CONFIG_NETFILTER_XT_MATCH_TIME=m
-CONFIG_NETFILTER_XT_MATCH_U32=m
-CONFIG_IP_VS=m
-CONFIG_IP_VS_IPV6=y
-CONFIG_IP_VS_PROTO_TCP=y
-CONFIG_IP_VS_PROTO_UDP=y
-CONFIG_IP_VS_PROTO_ESP=y
-CONFIG_IP_VS_PROTO_AH=y
-CONFIG_IP_VS_PROTO_SCTP=y
-CONFIG_IP_VS_RR=m
-CONFIG_IP_VS_WRR=m
-CONFIG_IP_VS_LC=m
-CONFIG_IP_VS_WLC=m
-CONFIG_IP_VS_LBLC=m
-CONFIG_IP_VS_LBLCR=m
-CONFIG_IP_VS_SED=m
-CONFIG_IP_VS_NQ=m
-CONFIG_NF_CONNTRACK_IPV4=m
-# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set
-CONFIG_IP_NF_QUEUE=m
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MATCH_AH=m
-CONFIG_IP_NF_MATCH_ECN=m
-CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=m
-CONFIG_IP_NF_TARGET_ULOG=m
-CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_ECN=m
-CONFIG_IP_NF_TARGET_TTL=m
-CONFIG_IP_NF_RAW=m
-CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPTABLES=m
-CONFIG_IP_NF_ARPFILTER=m
-CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_IP6_NF_QUEUE=m
-CONFIG_IP6_NF_IPTABLES=m
-CONFIG_IP6_NF_MATCH_AH=m
-CONFIG_IP6_NF_MATCH_EUI64=m
-CONFIG_IP6_NF_MATCH_FRAG=m
-CONFIG_IP6_NF_MATCH_OPTS=m
-CONFIG_IP6_NF_MATCH_HL=m
-CONFIG_IP6_NF_MATCH_IPV6HEADER=m
-CONFIG_IP6_NF_MATCH_MH=m
-CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
-CONFIG_IP6_NF_FILTER=m
-CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_IP6_NF_RAW=m
-CONFIG_IP6_NF_SECURITY=m
-CONFIG_BRIDGE_NF_EBTABLES=m
-CONFIG_BRIDGE_EBT_BROUTE=m
-CONFIG_BRIDGE_EBT_T_FILTER=m
-CONFIG_BRIDGE_EBT_T_NAT=m
-CONFIG_BRIDGE_EBT_802_3=m
-CONFIG_BRIDGE_EBT_AMONG=m
-CONFIG_BRIDGE_EBT_ARP=m
-CONFIG_BRIDGE_EBT_IP=m
-CONFIG_BRIDGE_EBT_IP6=m
-CONFIG_BRIDGE_EBT_LIMIT=m
-CONFIG_BRIDGE_EBT_MARK=m
-CONFIG_BRIDGE_EBT_PKTTYPE=m
-CONFIG_BRIDGE_EBT_STP=m
-CONFIG_BRIDGE_EBT_VLAN=m
-CONFIG_BRIDGE_EBT_ARPREPLY=m
-CONFIG_BRIDGE_EBT_DNAT=m
-CONFIG_BRIDGE_EBT_MARK_T=m
-CONFIG_BRIDGE_EBT_REDIRECT=m
-CONFIG_BRIDGE_EBT_SNAT=m
-CONFIG_BRIDGE_EBT_LOG=m
-CONFIG_BRIDGE_EBT_ULOG=m
-CONFIG_BRIDGE_EBT_NFLOG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 CONFIG_BRIDGE=m
-CONFIG_NET_DSA=y
 CONFIG_VLAN_8021Q=m
 CONFIG_VLAN_8021Q_GVRP=y
 CONFIG_PHONET=m
@@ -292,13 +150,13 @@ CONFIG_NET_ACT_POLICE=m
 CONFIG_NET_ACT_GACT=m
 CONFIG_GACT_PROB=y
 CONFIG_NET_ACT_MIRRED=m
-CONFIG_NET_ACT_IPT=m
 CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_CLS_IND=y
 CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -317,10 +175,12 @@ CONFIG_BLK_DEV_SD=y
 CONFIG_SCSI_CONSTANTS=y
 CONFIG_SCSI_LOGGING=y
 CONFIG_SCSI_SAS_ATA=y
+CONFIG_ISCSI_TCP=m
 CONFIG_SCSI_MVSAS=y
 # CONFIG_SCSI_MVSAS_DEBUG is not set
 CONFIG_SCSI_MVSAS_TASKLET=y
 CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
 CONFIG_SATA_SIL24=y
 # CONFIG_ATA_SFF is not set
 CONFIG_MD=y
@@ -343,6 +203,12 @@ CONFIG_DM_MULTIPATH_QL=m
 CONFIG_DM_MULTIPATH_ST=m
 CONFIG_DM_DELAY=m
 CONFIG_DM_UEVENT=y
+CONFIG_TARGET_CORE=m
+CONFIG_TCM_IBLOCK=m
+CONFIG_TCM_FILEIO=m
+CONFIG_TCM_PSCSI=m
+CONFIG_LOOPBACK_TARGET=m
+CONFIG_ISCSI_TARGET=m
 CONFIG_FUSION=y
 CONFIG_FUSION_SAS=y
 CONFIG_NETDEVICES=y
@@ -359,42 +225,8 @@ CONFIG_VETH=m
 CONFIG_NET_DSA_MV88E6060=y
 CONFIG_NET_DSA_MV88E6131=y
 CONFIG_NET_DSA_MV88E6123_61_65=y
-# CONFIG_NET_VENDOR_3COM is not set
-# CONFIG_NET_VENDOR_ADAPTEC is not set
-# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
-# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_CISCO is not set
-# CONFIG_NET_VENDOR_DEC is not set
-# CONFIG_NET_VENDOR_DLINK is not set
-# CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_EXAR is not set
-# CONFIG_NET_VENDOR_HP is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MELLANOX is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MYRI is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_NVIDIA is not set
-# CONFIG_NET_VENDOR_OKI is not set
-# CONFIG_NET_PACKET_ENGINE is not set
-# CONFIG_NET_VENDOR_QLOGIC is not set
-# CONFIG_NET_VENDOR_REALTEK is not set
-# CONFIG_NET_VENDOR_RDC is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SILAN is not set
-# CONFIG_NET_VENDOR_SIS is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-# CONFIG_NET_VENDOR_STMICRO is not set
-# CONFIG_NET_VENDOR_SUN is not set
-# CONFIG_NET_VENDOR_TEHUTI is not set
-# CONFIG_NET_VENDOR_TI is not set
-# CONFIG_TILE_NET is not set
-# CONFIG_NET_VENDOR_VIA is not set
+CONFIG_SKY2=y
+CONFIG_PTP_1588_CLOCK_TILEGX=y
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
@@ -402,6 +234,7 @@ CONFIG_NET_DSA_MV88E6123_61_65=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_TILEGX=y
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_TIMERIOMEM=m
 CONFIG_I2C=y
@@ -410,13 +243,16 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
-# CONFIG_HID_SUPPORT is not set
+CONFIG_DRM=m
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_MGA=m
+CONFIG_DRM_VIA=m
+CONFIG_DRM_SAVAGE=m
 CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
@@ -464,9 +300,8 @@ CONFIG_ECRYPT_FS=m
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_FSCACHE=y
 CONFIG_NFSD=m
@@ -519,25 +354,28 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
 CONFIG_DLM_DEBUG=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+# CONFIG_FRAME_POINTER is not set
+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_REDUCED=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
-CONFIG_DYNAMIC_DEBUG=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_ASYNC_RAID6_TEST=m
-CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_KGDB=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -546,7 +384,6 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_TEST=m
@@ -559,14 +396,12 @@ CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32C=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index dd2b8f0..80fc32e 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -1,15 +1,14 @@
-CONFIG_EXPERIMENTAL=y
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_FHANDLE=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=19
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_DEBUG=y
@@ -17,14 +16,13 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEMCG=y
-CONFIG_CGROUP_MEMCG_SWAP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_BLK_CGROUP=y
 CONFIG_NAMESPACES=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_RD_XZ=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
@@ -44,11 +42,10 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_SGI_PARTITION=y
 CONFIG_SUN_PARTITION=y
 CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_CFQ_GROUP_IOSCHED=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_HZ_100=y
+# CONFIG_COMPACTION is not set
+CONFIG_PREEMPT_VOLUNTARY=y
 CONFIG_PCI_DEBUG=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=y
@@ -122,16 +119,15 @@ CONFIG_NF_CONNTRACK_PPTP=m
 CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NETFILTER_TPROXY=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
 CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
 CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
 CONFIG_NETFILTER_XT_TARGET_DSCP=m
 CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
 CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NETFILTER_XT_TARGET_NFLOG=m
 CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
 CONFIG_NETFILTER_XT_TARGET_TEE=m
 CONFIG_NETFILTER_XT_TARGET_TPROXY=m
 CONFIG_NETFILTER_XT_TARGET_TRACE=m
@@ -189,14 +185,12 @@ CONFIG_IP_VS_SED=m
 CONFIG_IP_VS_NQ=m
 CONFIG_NF_CONNTRACK_IPV4=m
 # CONFIG_NF_CONNTRACK_PROC_COMPAT is not set
-CONFIG_IP_NF_QUEUE=m
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_MATCH_AH=m
 CONFIG_IP_NF_MATCH_ECN=m
 CONFIG_IP_NF_MATCH_TTL=m
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=m
 CONFIG_IP_NF_TARGET_ULOG=m
 CONFIG_IP_NF_MANGLE=m
 CONFIG_IP_NF_TARGET_ECN=m
@@ -207,8 +201,6 @@ CONFIG_IP_NF_ARPTABLES=m
 CONFIG_IP_NF_ARPFILTER=m
 CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_IP6_NF_QUEUE=m
-CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
 CONFIG_IP6_NF_MATCH_FRAG=m
@@ -218,7 +210,6 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m
 CONFIG_IP6_NF_MATCH_MH=m
 CONFIG_IP6_NF_MATCH_RT=m
 CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
 CONFIG_IP6_NF_FILTER=m
 CONFIG_IP6_NF_TARGET_REJECT=m
 CONFIG_IP6_NF_MANGLE=m
@@ -249,7 +240,6 @@ CONFIG_BRIDGE_EBT_NFLOG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 CONFIG_BRIDGE=m
-CONFIG_NET_DSA=y
 CONFIG_VLAN_8021Q=m
 CONFIG_VLAN_8021Q_GVRP=y
 CONFIG_PHONET=m
@@ -297,6 +287,7 @@ CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_CLS_IND=y
 CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -354,40 +345,7 @@ CONFIG_NET_DSA_MV88E6060=y
 CONFIG_NET_DSA_MV88E6131=y
 CONFIG_NET_DSA_MV88E6123_61_65=y
 # CONFIG_NET_VENDOR_3COM is not set
-# CONFIG_NET_VENDOR_ADAPTEC is not set
-# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
-# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_CISCO is not set
-# CONFIG_NET_VENDOR_DEC is not set
-# CONFIG_NET_VENDOR_DLINK is not set
-# CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_EXAR is not set
-# CONFIG_NET_VENDOR_HP is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MELLANOX is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MYRI is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_NVIDIA is not set
-# CONFIG_NET_VENDOR_OKI is not set
-# CONFIG_NET_PACKET_ENGINE is not set
-# CONFIG_NET_VENDOR_QLOGIC is not set
-# CONFIG_NET_VENDOR_REALTEK is not set
-# CONFIG_NET_VENDOR_RDC is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SILAN is not set
-# CONFIG_NET_VENDOR_SIS is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-# CONFIG_NET_VENDOR_STMICRO is not set
-# CONFIG_NET_VENDOR_SUN is not set
-# CONFIG_NET_VENDOR_TEHUTI is not set
-# CONFIG_NET_VENDOR_TI is not set
-# CONFIG_NET_VENDOR_VIA is not set
+CONFIG_E1000E=y
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
@@ -403,7 +361,6 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
@@ -448,13 +405,13 @@ CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
+CONFIG_CONFIGFS_FS=m
 CONFIG_ECRYPT_FS=m
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_FSCACHE=y
 CONFIG_NFSD=m
@@ -508,26 +465,29 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
 CONFIG_DLM_DEBUG=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 CONFIG_FRAME_WARN=2048
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+# CONFIG_FRAME_POINTER is not set
+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_REDUCED=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
-CONFIG_DYNAMIC_DEBUG=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_ASYNC_RAID6_TEST=m
-CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -536,7 +496,6 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_TEST=m
@@ -549,14 +508,12 @@ CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32C=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
index d221f8d..d4e10d5 100644
--- a/arch/tile/gxio/Kconfig
+++ b/arch/tile/gxio/Kconfig
@@ -26,3 +26,8 @@ config TILE_GXIO_TRIO
 config TILE_GXIO_USB_HOST
 	bool
 	select TILE_GXIO
+
+# Support direct access to the TILE-Gx UART hardware from kernel space.
+config TILE_GXIO_UART
+	bool
+	select TILE_GXIO
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
index 8684bca..26ae2c7 100644
--- a/arch/tile/gxio/Makefile
+++ b/arch/tile/gxio/Makefile
@@ -6,4 +6,5 @@ obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
 obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
 obj-$(CONFIG_TILE_GXIO_MPIPE) += mpipe.o iorpc_mpipe.o iorpc_mpipe_info.o
 obj-$(CONFIG_TILE_GXIO_TRIO) += trio.o iorpc_trio.o
+obj-$(CONFIG_TILE_GXIO_UART) += uart.o iorpc_uart.o
 obj-$(CONFIG_TILE_GXIO_USB_HOST) += usb_host.o iorpc_usb_host.o
diff --git a/arch/tile/gxio/iorpc_trio.c b/arch/tile/gxio/iorpc_trio.c
index cef4b22..da6e18e 100644
--- a/arch/tile/gxio/iorpc_trio.c
+++ b/arch/tile/gxio/iorpc_trio.c
@@ -61,6 +61,29 @@ int gxio_trio_alloc_memory_maps(gxio_trio_context_t * context,
 
 EXPORT_SYMBOL(gxio_trio_alloc_memory_maps);
 
+struct alloc_scatter_queues_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t * context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags)
+{
+	struct alloc_scatter_queues_param temp;
+	struct alloc_scatter_queues_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_scatter_queues);
 
 struct alloc_pio_regions_param {
 	unsigned int count;
diff --git a/arch/tile/gxio/iorpc_uart.c b/arch/tile/gxio/iorpc_uart.c
new file mode 100644
index 0000000..b9a6d61
--- /dev/null
+++ b/arch/tile/gxio/iorpc_uart.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_uart.h"
+
+struct cfg_interrupt_param {
+	union iorpc_interrupt interrupt;
+};
+
+int gxio_uart_cfg_interrupt(gxio_uart_context_t *context, int inter_x,
+			    int inter_y, int inter_ipi, int inter_event)
+{
+	struct cfg_interrupt_param temp;
+	struct cfg_interrupt_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_UART_OP_CFG_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_uart_cfg_interrupt);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_uart_get_mmio_base(gxio_uart_context_t *context, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_UART_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_uart_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_uart_check_mmio_offset(gxio_uart_context_t *context,
+				unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_UART_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_uart_check_mmio_offset);
diff --git a/arch/tile/gxio/uart.c b/arch/tile/gxio/uart.c
new file mode 100644
index 0000000..ba58517
--- /dev/null
+++ b/arch/tile/gxio/uart.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of UART gxio calls.
+ */
+
+#include <linux/io.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+#include <gxio/uart.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_uart.h>
+#include <gxio/kiorpc.h>
+
+int gxio_uart_init(gxio_uart_context_t *context, int uart_index)
+{
+	char file[32];
+	int fd;
+
+	snprintf(file, sizeof(file), "uart/%d/iorpc", uart_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	/* Map in the MMIO space. */
+	context->mmio_base = (void __force *)
+		iorpc_ioremap(fd, HV_UART_MMIO_OFFSET, HV_UART_MMIO_SIZE);
+
+	if (context->mmio_base == NULL) {
+		hv_dev_close(context->fd);
+		context->fd = -1;
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_init);
+
+int gxio_uart_destroy(gxio_uart_context_t *context)
+{
+	iounmap((void __force __iomem *)(context->mmio_base));
+	hv_dev_close(context->fd);
+
+	context->mmio_base = NULL;
+	context->fd = -1;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_destroy);
+
+/* Write 'word' to the UART register at mmio_base + offset. */
+void gxio_uart_write(gxio_uart_context_t *context, uint64_t offset,
+		     uint64_t word)
+{
+	__gxio_mmio_write(context->mmio_base + offset, word);
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_write);
+
+/* Read and return the UART register at mmio_base + offset. */
+uint64_t gxio_uart_read(gxio_uart_context_t *context, uint64_t offset)
+{
+	return __gxio_mmio_read(context->mmio_base + offset);
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_read);
diff --git a/arch/tile/include/arch/trio.h b/arch/tile/include/arch/trio.h
index d3000a8..c0ddedc 100644
--- a/arch/tile/include/arch/trio.h
+++ b/arch/tile/include/arch/trio.h
@@ -23,6 +23,45 @@
 #ifndef __ASSEMBLER__
 
 /*
+ * Map SQ Doorbell Format.
+ * This describes the format of the write-only doorbell register that exists
+ * in the last 8-bytes of the MAP_SQ_BASE/LIM range.  This register is only
+ * writable from PCIe space.  Writes to this register will not be written to
+ * Tile memory space and thus no IO VA translation is required if the last
+ * page of the BASE/LIM range is not otherwise written.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * When written with a 1, the associated MAP_SQ region's doorbell
+     * interrupt will be triggered once all previous writes are visible to
+     * Tile software.
+     */
+    uint_reg_t doorbell   : 1;
+    /*
+     * When written with a 1, the descriptor at the head of the associated
+     * MAP_SQ's FIFO will be dequeued.
+     */
+    uint_reg_t pop        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved : 62;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 62;
+    uint_reg_t pop        : 1;
+    uint_reg_t doorbell   : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_MAP_SQ_DOORBELL_FMT_t;
+
+
+/*
  * Tile PIO Region Configuration - CFG Address Format.
  * This register describes the address format for PIO accesses when the
  * associated region is setup with TYPE=CFG.
diff --git a/arch/tile/include/arch/uart.h b/arch/tile/include/arch/uart.h
new file mode 100644
index 0000000..0796697
--- /dev/null
+++ b/arch/tile/include/arch/uart.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_UART_H__
+#define __ARCH_UART_H__
+
+#include <arch/abi.h>
+#include <arch/uart_def.h>
+
+#ifndef __ASSEMBLER__
+
+/* Divisor. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Baud Rate Divisor.  Desired_baud_rate = REF_CLK frequency /
+     * (divisor * 16).
+     * Note: REF_CLK is always 125 MHz; with the default divisor of 68,
+     * baud rate = 125M/(68*16) ~= 115200 baud.
+     */
+    uint_reg_t divisor    : 12;
+    /* Reserved. */
+    uint_reg_t __reserved : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 52;
+    uint_reg_t divisor    : 12;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_DIVISOR_t;
+
+/* FIFO Count. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * n: n active entries in the receive FIFO (max is 2**8). Each entry has
+     * 8 bits.
+     * 0: no active entry in the receive FIFO (that is empty).
+     */
+    uint_reg_t rfifo_count  : 9;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 7;
+    /*
+     * n: n active entries in the transmit FIFO (max is 2**8). Each entry has
+     * 8 bits.
+     * 0: no active entry in the transmit FIFO (that is empty).
+     */
+    uint_reg_t tfifo_count  : 9;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 7;
+    /*
+     * n: n active entries in the write FIFO (max is 2**2). Each entry has 8
+     * bits.
+     * 0: no active entry in the write FIFO (that is empty).
+     */
+    uint_reg_t wfifo_count  : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 29;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 29;
+    uint_reg_t wfifo_count  : 3;
+    uint_reg_t __reserved_1 : 7;
+    uint_reg_t tfifo_count  : 9;
+    uint_reg_t __reserved_0 : 7;
+    uint_reg_t rfifo_count  : 9;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_FIFO_COUNT_t;
+
+/* FLAG. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /* 1: receive FIFO is empty */
+    uint_reg_t rfifo_empty  : 1;
+    /* 1: write FIFO is empty. */
+    uint_reg_t wfifo_empty  : 1;
+    /* 1: transmit FIFO is empty. */
+    uint_reg_t tfifo_empty  : 1;
+    /* 1: receive FIFO is full. */
+    uint_reg_t rfifo_full   : 1;
+    /* 1: write FIFO is full. */
+    uint_reg_t wfifo_full   : 1;
+    /* 1: transmit FIFO is full. */
+    uint_reg_t tfifo_full   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 57;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1 : 57;
+    uint_reg_t tfifo_full   : 1;
+    uint_reg_t wfifo_full   : 1;
+    uint_reg_t rfifo_full   : 1;
+    uint_reg_t tfifo_empty  : 1;
+    uint_reg_t wfifo_empty  : 1;
+    uint_reg_t rfifo_empty  : 1;
+    uint_reg_t __reserved_0 : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_FLAG_t;
+
+/*
+ * Interrupt Vector Mask.
+ * Each bit in this register corresponds to a specific interrupt. When set,
+ * the associated interrupt will not be dispatched.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Read data FIFO was read when no data was available */
+    uint_reg_t rdat_err       : 1;
+    /* Write FIFO was written but it was full */
+    uint_reg_t wdat_err       : 1;
+    /* Stop bit not found when current data was received */
+    uint_reg_t frame_err      : 1;
+    /* Parity error was detected when current data was received */
+    uint_reg_t parity_err     : 1;
+    /* Data was received but the receive FIFO was full */
+    uint_reg_t rfifo_overflow : 1;
+    /*
+     * An almost full event is reached when data is to be written to the
+     * receive FIFO, and the receive FIFO has more than or equal to
+     * BUFFER_THRESHOLD.RFIFO_AFULL bytes.
+     */
+    uint_reg_t rfifo_afull    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0   : 1;
+    /* An entry in the transmit FIFO was popped */
+    uint_reg_t tfifo_re       : 1;
+    /* An entry has been pushed into the receive FIFO */
+    uint_reg_t rfifo_we       : 1;
+    /* An entry of the write FIFO has been popped */
+    uint_reg_t wfifo_re       : 1;
+    /* Rshim read receive FIFO in protocol mode */
+    uint_reg_t rfifo_err      : 1;
+    /*
+     * An almost empty event is reached when data is to be read from the
+     * transmit FIFO, and the transmit FIFO has less than or equal to
+     * BUFFER_THRESHOLD.TFIFO_AEMPTY bytes.
+     */
+    uint_reg_t tfifo_aempty   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1   : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1   : 52;
+    uint_reg_t tfifo_aempty   : 1;
+    uint_reg_t rfifo_err      : 1;
+    uint_reg_t wfifo_re       : 1;
+    uint_reg_t rfifo_we       : 1;
+    uint_reg_t tfifo_re       : 1;
+    uint_reg_t __reserved_0   : 1;
+    uint_reg_t rfifo_afull    : 1;
+    uint_reg_t rfifo_overflow : 1;
+    uint_reg_t parity_err     : 1;
+    uint_reg_t frame_err      : 1;
+    uint_reg_t wdat_err       : 1;
+    uint_reg_t rdat_err       : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_INTERRUPT_MASK_t;
+
+/*
+ * Interrupt vector, write-one-to-clear.
+ * Each bit in this register corresponds to a specific interrupt. Hardware
+ * sets the bit when the associated condition has occurred. Writing a 1
+ * clears the status bit.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Read data FIFO was read when no data was available */
+    uint_reg_t rdat_err       : 1;
+    /* Write FIFO was written but it was full */
+    uint_reg_t wdat_err       : 1;
+    /* Stop bit not found when current data was received */
+    uint_reg_t frame_err      : 1;
+    /* Parity error was detected when current data was received */
+    uint_reg_t parity_err     : 1;
+    /* Data was received but the receive FIFO was full */
+    uint_reg_t rfifo_overflow : 1;
+    /*
+     * Data was received and the receive FIFO is now almost full (more than
+     * BUFFER_THRESHOLD.RFIFO_AFULL bytes in it)
+     */
+    uint_reg_t rfifo_afull    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0   : 1;
+    /* An entry in the transmit FIFO was popped */
+    uint_reg_t tfifo_re       : 1;
+    /* An entry has been pushed into the receive FIFO */
+    uint_reg_t rfifo_we       : 1;
+    /* An entry of the write FIFO has been popped */
+    uint_reg_t wfifo_re       : 1;
+    /* Rshim read receive FIFO in protocol mode */
+    uint_reg_t rfifo_err      : 1;
+    /*
+     * Data was read from the transmit FIFO and now it is almost empty (less
+     * than or equal to BUFFER_THRESHOLD.TFIFO_AEMPTY bytes in it).
+     */
+    uint_reg_t tfifo_aempty   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1   : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1   : 52;
+    uint_reg_t tfifo_aempty   : 1;
+    uint_reg_t rfifo_err      : 1;
+    uint_reg_t wfifo_re       : 1;
+    uint_reg_t rfifo_we       : 1;
+    uint_reg_t tfifo_re       : 1;
+    uint_reg_t __reserved_0   : 1;
+    uint_reg_t rfifo_afull    : 1;
+    uint_reg_t rfifo_overflow : 1;
+    uint_reg_t parity_err     : 1;
+    uint_reg_t frame_err      : 1;
+    uint_reg_t wdat_err       : 1;
+    uint_reg_t rdat_err       : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_INTERRUPT_STATUS_t;
+
+/* Type. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Number of stop bits, rx and tx */
+    uint_reg_t sbits        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /* Data word size, rx and tx */
+    uint_reg_t dbits        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 1;
+    /* Parity selection, rx and tx */
+    uint_reg_t ptype        : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 57;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 57;
+    uint_reg_t ptype        : 3;
+    uint_reg_t __reserved_1 : 1;
+    uint_reg_t dbits        : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t sbits        : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_TYPE_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_UART_H__) */
diff --git a/arch/tile/include/arch/uart_def.h b/arch/tile/include/arch/uart_def.h
new file mode 100644
index 0000000..42bcaf5
--- /dev/null
+++ b/arch/tile/include/arch/uart_def.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_UART_DEF_H__
+#define __ARCH_UART_DEF_H__
+#define UART_DIVISOR 0x0158
+#define UART_FIFO_COUNT 0x0110
+#define UART_FLAG 0x0108
+#define UART_INTERRUPT_MASK 0x0208
+#define UART_INTERRUPT_MASK__RDAT_ERR_SHIFT 0
+#define UART_INTERRUPT_MASK__RDAT_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__RDAT_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RDAT_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__RDAT_ERR_MASK  0x1
+#define UART_INTERRUPT_MASK__RDAT_ERR_FIELD 0,0
+#define UART_INTERRUPT_MASK__WDAT_ERR_SHIFT 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__WDAT_ERR_MASK  0x2
+#define UART_INTERRUPT_MASK__WDAT_ERR_FIELD 1,1
+#define UART_INTERRUPT_MASK__FRAME_ERR_SHIFT 2
+#define UART_INTERRUPT_MASK__FRAME_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__FRAME_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__FRAME_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__FRAME_ERR_MASK  0x4
+#define UART_INTERRUPT_MASK__FRAME_ERR_FIELD 2,2
+#define UART_INTERRUPT_MASK__PARITY_ERR_SHIFT 3
+#define UART_INTERRUPT_MASK__PARITY_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__PARITY_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__PARITY_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__PARITY_ERR_MASK  0x8
+#define UART_INTERRUPT_MASK__PARITY_ERR_FIELD 3,3
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_SHIFT 4
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_MASK  0x10
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_FIELD 4,4
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_SHIFT 5
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_MASK  0x20
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_FIELD 5,5
+#define UART_INTERRUPT_MASK__TFIFO_RE_SHIFT 7
+#define UART_INTERRUPT_MASK__TFIFO_RE_WIDTH 1
+#define UART_INTERRUPT_MASK__TFIFO_RE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__TFIFO_RE_RMASK 0x1
+#define UART_INTERRUPT_MASK__TFIFO_RE_MASK  0x80
+#define UART_INTERRUPT_MASK__TFIFO_RE_FIELD 7,7
+#define UART_INTERRUPT_MASK__RFIFO_WE_SHIFT 8
+#define UART_INTERRUPT_MASK__RFIFO_WE_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_WE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_WE_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_WE_MASK  0x100
+#define UART_INTERRUPT_MASK__RFIFO_WE_FIELD 8,8
+#define UART_INTERRUPT_MASK__WFIFO_RE_SHIFT 9
+#define UART_INTERRUPT_MASK__WFIFO_RE_WIDTH 1
+#define UART_INTERRUPT_MASK__WFIFO_RE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__WFIFO_RE_RMASK 0x1
+#define UART_INTERRUPT_MASK__WFIFO_RE_MASK  0x200
+#define UART_INTERRUPT_MASK__WFIFO_RE_FIELD 9,9
+#define UART_INTERRUPT_MASK__RFIFO_ERR_SHIFT 10
+#define UART_INTERRUPT_MASK__RFIFO_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_MASK  0x400
+#define UART_INTERRUPT_MASK__RFIFO_ERR_FIELD 10,10
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_SHIFT 11
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_WIDTH 1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_RESET_VAL 1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_RMASK 0x1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_MASK  0x800
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_FIELD 11,11
+#define UART_INTERRUPT_STATUS 0x0200
+#define UART_RECEIVE_DATA 0x0148
+#define UART_TRANSMIT_DATA 0x0140
+#define UART_TYPE 0x0160
+#define UART_TYPE__SBITS_SHIFT 0
+#define UART_TYPE__SBITS_WIDTH 1
+#define UART_TYPE__SBITS_RESET_VAL 1
+#define UART_TYPE__SBITS_RMASK 0x1
+#define UART_TYPE__SBITS_MASK  0x1
+#define UART_TYPE__SBITS_FIELD 0,0
+#define UART_TYPE__SBITS_VAL_ONE_SBITS 0x0
+#define UART_TYPE__SBITS_VAL_TWO_SBITS 0x1
+#define UART_TYPE__DBITS_SHIFT 2
+#define UART_TYPE__DBITS_WIDTH 1
+#define UART_TYPE__DBITS_RESET_VAL 0
+#define UART_TYPE__DBITS_RMASK 0x1
+#define UART_TYPE__DBITS_MASK  0x4
+#define UART_TYPE__DBITS_FIELD 2,2
+#define UART_TYPE__DBITS_VAL_EIGHT_DBITS 0x0
+#define UART_TYPE__DBITS_VAL_SEVEN_DBITS 0x1
+#define UART_TYPE__PTYPE_SHIFT 4
+#define UART_TYPE__PTYPE_WIDTH 3
+#define UART_TYPE__PTYPE_RESET_VAL 3
+#define UART_TYPE__PTYPE_RMASK 0x7
+#define UART_TYPE__PTYPE_MASK  0x70
+#define UART_TYPE__PTYPE_FIELD 4,6
+#define UART_TYPE__PTYPE_VAL_NONE 0x0
+#define UART_TYPE__PTYPE_VAL_MARK 0x1
+#define UART_TYPE__PTYPE_VAL_SPACE 0x2
+#define UART_TYPE__PTYPE_VAL_EVEN 0x3
+#define UART_TYPE__PTYPE_VAL_ODD 0x4
+#endif /* !defined(__ARCH_UART_DEF_H__) */
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index b17b9b8..664d6ad 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -11,12 +11,13 @@ generic-y += errno.h
 generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
+generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
-generic-y += kdebug.h
 generic-y += local.h
+generic-y += local64.h
 generic-y += msgbuf.h
 generic-y += mutex.h
 generic-y += param.h
diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h
index e71387a..d385eaa 100644
--- a/arch/tile/include/asm/atomic.h
+++ b/arch/tile/include/asm/atomic.h
@@ -114,6 +114,32 @@ static inline int atomic_read(const atomic_t *v)
 #define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
 
 /**
+ * atomic_xchg - atomically exchange contents of memory with a new value
+ * @v: pointer of type atomic_t
+ * @n: integer value to store in memory
+ *
+ * Atomically sets @v to @n and returns the old value of @v
+ */
+static inline int atomic_xchg(atomic_t *v, int n)
+{
+	return xchg(&v->counter, n);
+}
+
+/**
+ * atomic_cmpxchg - atomically exchange contents of memory if it matches
+ * @v: pointer of type atomic_t
+ * @o: old value that memory should have
+ * @n: new value to write to memory if it matches
+ *
+ * Atomically checks if @v holds @o and replaces it with @n if so.
+ * Returns the old value at @v.
+ */
+static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
+{
+	return cmpxchg(&v->counter, o, n);
+}
+
+/**
  * atomic_add_negative - add and test if negative
  * @v: pointer of type atomic_t
  * @i: integer value to add
@@ -133,6 +159,32 @@ static inline int atomic_read(const atomic_t *v)
 
 #ifndef __ASSEMBLY__
 
+/**
+ * atomic64_xchg - atomically exchange contents of memory with a new value
+ * @v: pointer of type atomic64_t
+ * @n: integer value to store in memory
+ *
+ * Atomically sets @v to @n and returns the old value of @v
+ */
+static inline u64 atomic64_xchg(atomic64_t *v, u64 n)
+{
+	return xchg64(&v->counter, n);
+}
+
+/**
+ * atomic64_cmpxchg - atomically exchange contents of memory if it matches
+ * @v: pointer of type atomic64_t
+ * @o: old value that memory should have
+ * @n: new value to write to memory if it matches
+ *
+ * Atomically checks if @v holds @o and replaces it with @n if so.
+ * Returns the old value at @v.
+ */
+static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+{
+	return cmpxchg64(&v->counter, o, n);
+}
+
 static inline long long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long long c, old, dec;
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index e7fb5cf..0d0395b 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -22,40 +22,6 @@
 
 #ifndef __ASSEMBLY__
 
-/* Tile-specific routines to support <linux/atomic.h>. */
-int _atomic_xchg(atomic_t *v, int n);
-int _atomic_xchg_add(atomic_t *v, int i);
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u);
-int _atomic_cmpxchg(atomic_t *v, int o, int n);
-
-/**
- * atomic_xchg - atomically exchange contents of memory with a new value
- * @v: pointer of type atomic_t
- * @i: integer value to store in memory
- *
- * Atomically sets @v to @i and returns old @v
- */
-static inline int atomic_xchg(atomic_t *v, int n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg(v, n);
-}
-
-/**
- * atomic_cmpxchg - atomically exchange contents of memory if it matches
- * @v: pointer of type atomic_t
- * @o: old value that memory should have
- * @n: new value to write to memory if it matches
- *
- * Atomically checks if @v holds @o and replaces it with @n if so.
- * Returns the old value at @v.
- */
-static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic_cmpxchg(v, o, n);
-}
-
 /**
  * atomic_add - add integer to atomic variable
  * @i: integer value to add
@@ -65,7 +31,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
  */
 static inline void atomic_add(int i, atomic_t *v)
 {
-	_atomic_xchg_add(v, i);
+	_atomic_xchg_add(&v->counter, i);
 }
 
 /**
@@ -78,7 +44,7 @@ static inline void atomic_add(int i, atomic_t *v)
 static inline int atomic_add_return(int i, atomic_t *v)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg_add(v, i) + i;
+	return _atomic_xchg_add(&v->counter, i) + i;
 }
 
 /**
@@ -93,7 +59,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg_add_unless(v, a, u);
+	return _atomic_xchg_add_unless(&v->counter, a, u);
 }
 
 /**
@@ -108,7 +74,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
  */
 static inline void atomic_set(atomic_t *v, int n)
 {
-	_atomic_xchg(v, n);
+	_atomic_xchg(&v->counter, n);
 }
 
 /* A 64bit atomic type */
@@ -119,11 +85,6 @@ typedef struct {
 
 #define ATOMIC64_INIT(val) { (val) }
 
-u64 _atomic64_xchg(atomic64_t *v, u64 n);
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i);
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u);
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n);
-
 /**
  * atomic64_read - read atomic variable
  * @v: pointer of type atomic64_t
@@ -137,35 +98,7 @@ static inline u64 atomic64_read(const atomic64_t *v)
 	 * Casting away const is safe since the atomic support routines
 	 * do not write to memory if the value has not been modified.
 	 */
-	return _atomic64_xchg_add((atomic64_t *)v, 0);
-}
-
-/**
- * atomic64_xchg - atomically exchange contents of memory with a new value
- * @v: pointer of type atomic64_t
- * @i: integer value to store in memory
- *
- * Atomically sets @v to @i and returns old @v
- */
-static inline u64 atomic64_xchg(atomic64_t *v, u64 n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg(v, n);
-}
-
-/**
- * atomic64_cmpxchg - atomically exchange contents of memory if it matches
- * @v: pointer of type atomic64_t
- * @o: old value that memory should have
- * @n: new value to write to memory if it matches
- *
- * Atomically checks if @v holds @o and replaces it with @n if so.
- * Returns the old value at @v.
- */
-static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_cmpxchg(v, o, n);
+	return _atomic64_xchg_add((u64 *)&v->counter, 0);
 }
 
 /**
@@ -177,7 +110,7 @@ static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
  */
 static inline void atomic64_add(u64 i, atomic64_t *v)
 {
-	_atomic64_xchg_add(v, i);
+	_atomic64_xchg_add(&v->counter, i);
 }
 
 /**
@@ -190,7 +123,7 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
 static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg_add(v, i) + i;
+	return _atomic64_xchg_add(&v->counter, i) + i;
 }
 
 /**
@@ -205,7 +138,7 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
 static inline u64 atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg_add_unless(v, a, u) != u;
+	return _atomic64_xchg_add_unless(&v->counter, a, u) != u;
 }
 
 /**
@@ -220,7 +153,7 @@ static inline u64 atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
  */
 static inline void atomic64_set(atomic64_t *v, u64 n)
 {
-	_atomic64_xchg(v, n);
+	_atomic64_xchg(&v->counter, n);
 }
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
@@ -252,21 +185,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
  * Internal definitions only beyond this point.
  */
 
-#define ATOMIC_LOCKS_FOUND_VIA_TABLE() \
-  (!CHIP_HAS_CBOX_HOME_MAP() && defined(CONFIG_SMP))
-
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/* Number of entries in atomic_lock_ptr[]. */
-#define ATOMIC_HASH_L1_SHIFT 6
-#define ATOMIC_HASH_L1_SIZE (1 << ATOMIC_HASH_L1_SHIFT)
-
-/* Number of locks in each struct pointed to by atomic_lock_ptr[]. */
-#define ATOMIC_HASH_L2_SHIFT (CHIP_L2_LOG_LINE_SIZE() - 2)
-#define ATOMIC_HASH_L2_SIZE (1 << ATOMIC_HASH_L2_SHIFT)
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /*
  * Number of atomic locks in atomic_locks[]. Must be a power of two.
  * There is no reason for more than PAGE_SIZE / 8 entries, since that
@@ -281,8 +199,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
 extern int atomic_locks[];
 #endif
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /*
  * All the code that may fault while holding an atomic lock must
  * place the pointer to the lock in ATOMIC_LOCK_REG so the fault code
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index f4500c6..ad220ee 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -32,25 +32,6 @@
  * on any routine which updates memory and returns a value.
  */
 
-static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	int val;
-	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_cmpexch4((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
-static inline int atomic_xchg(atomic_t *v, int n)
-{
-	int val;
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_exch4((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
 static inline void atomic_add(int i, atomic_t *v)
 {
 	__insn_fetchadd4((void *)&v->counter, i);
@@ -72,7 +53,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 		if (oldval == u)
 			break;
 		guess = oldval;
-		oldval = atomic_cmpxchg(v, guess, guess + a);
+		oldval = cmpxchg(&v->counter, guess, guess + a);
 	} while (guess != oldval);
 	return oldval;
 }
@@ -84,25 +65,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 #define atomic64_read(v)		((v)->counter)
 #define atomic64_set(v, i) ((v)->counter = (i))
 
-static inline long atomic64_cmpxchg(atomic64_t *v, long o, long n)
-{
-	long val;
-	smp_mb();  /* barrier for proper semantics */
-	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
-	val = __insn_cmpexch((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
-static inline long atomic64_xchg(atomic64_t *v, long n)
-{
-	long val;
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_exch((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
 static inline void atomic64_add(long i, atomic64_t *v)
 {
 	__insn_fetchadd((void *)&v->counter, i);
@@ -124,7 +86,7 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 		if (oldval == u)
 			break;
 		guess = oldval;
-		oldval = atomic64_cmpxchg(v, guess, guess + a);
+		oldval = cmpxchg(&v->counter, guess, guess + a);
 	} while (guess != oldval);
 	return oldval != u;
 }
diff --git a/arch/tile/include/asm/barrier.h b/arch/tile/include/asm/barrier.h
index 990a217..a9a73da 100644
--- a/arch/tile/include/asm/barrier.h
+++ b/arch/tile/include/asm/barrier.h
@@ -77,7 +77,6 @@
 
 #define __sync()	__insn_mf()
 
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
 #include <hv/syscall_public.h>
 /*
  * Issue an uncacheable load to each memory controller, then
@@ -96,7 +95,6 @@ static inline void __mb_incoherent(void)
 		       "r20", "r21", "r22", "r23", "r24",
 		       "r25", "r26", "r27", "r28", "r29");
 }
-#endif
 
 /* Fence to guarantee visibility of stores to incoherent memory. */
 static inline void
@@ -104,7 +102,6 @@ mb_incoherent(void)
 {
 	__insn_mf();
 
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
 	{
 #if CHIP_HAS_TILE_WRITE_PENDING()
 		const unsigned long WRITE_TIMEOUT_CYCLES = 400;
@@ -116,7 +113,6 @@ mb_incoherent(void)
 #endif /* CHIP_HAS_TILE_WRITE_PENDING() */
 		(void) __mb_incoherent();
 	}
-#endif /* CHIP_HAS_MF_WAITS_FOR_VICTIMS() */
 }
 
 #define fast_wmb()	__sync()
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index bd186c4..d5a2068 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -29,17 +29,6 @@
 #endif
 
 /**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
-	return __builtin_ctzl(word);
-}
-
-/**
  * ffz - find first zero bit in word
  * @word: The word to search
  *
@@ -50,33 +39,6 @@ static inline unsigned long ffz(unsigned long word)
 	return __builtin_ctzl(~word);
 }
 
-/**
- * __fls - find last set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
-	return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
-}
-
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-static inline int ffs(int x)
-{
-	return __builtin_ffs(x);
-}
-
 static inline int fls64(__u64 w)
 {
 	return (sizeof(__u64) * 8) - __builtin_clzll(w);
@@ -118,6 +80,9 @@ static inline unsigned long __arch_hweight64(__u64 w)
 	return __builtin_popcountll(w);
 }
 
+#include <asm-generic/bitops/builtin-__ffs.h>
+#include <asm-generic/bitops/builtin-__fls.h>
+#include <asm-generic/bitops/builtin-ffs.h>
 #include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/find.h>
diff --git a/arch/tile/include/asm/bitops_32.h b/arch/tile/include/asm/bitops_32.h
index ddc4c1e..386865ad 100644
--- a/arch/tile/include/asm/bitops_32.h
+++ b/arch/tile/include/asm/bitops_32.h
@@ -16,7 +16,7 @@
 #define _ASM_TILE_BITOPS_32_H
 
 #include <linux/compiler.h>
-#include <linux/atomic.h>
+#include <asm/barrier.h>
 
 /* Tile-specific routines to support <asm/bitops.h>. */
 unsigned long _atomic_or(volatile unsigned long *p, unsigned long mask);
diff --git a/arch/tile/include/asm/bitops_64.h b/arch/tile/include/asm/bitops_64.h
index 60b87ee..ad34cd0 100644
--- a/arch/tile/include/asm/bitops_64.h
+++ b/arch/tile/include/asm/bitops_64.h
@@ -16,7 +16,7 @@
 #define _ASM_TILE_BITOPS_64_H
 
 #include <linux/compiler.h>
-#include <linux/atomic.h>
+#include <asm/cmpxchg.h>
 
 /* See <asm/bitops.h> for API comments. */
 
@@ -44,8 +44,7 @@ static inline void change_bit(unsigned nr, volatile unsigned long *addr)
 	oldval = *addr;
 	do {
 		guess = oldval;
-		oldval = atomic64_cmpxchg((atomic64_t *)addr,
-					  guess, guess ^ mask);
+		oldval = cmpxchg(addr, guess, guess ^ mask);
 	} while (guess != oldval);
 }
 
@@ -90,8 +89,7 @@ static inline int test_and_change_bit(unsigned nr,
 	oldval = *addr;
 	do {
 		guess = oldval;
-		oldval = atomic64_cmpxchg((atomic64_t *)addr,
-					  guess, guess ^ mask);
+		oldval = cmpxchg(addr, guess, guess ^ mask);
 	} while (guess != oldval);
 	return (oldval & mask) != 0;
 }
diff --git a/arch/tile/include/asm/cache.h b/arch/tile/include/asm/cache.h
index a9a5299..6160761 100644
--- a/arch/tile/include/asm/cache.h
+++ b/arch/tile/include/asm/cache.h
@@ -49,9 +49,16 @@
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 /*
- * Attribute for data that is kept read/write coherent until the end of
- * initialization, then bumped to read/only incoherent for performance.
+ * Originally we used small TLB pages for kernel data and grouped some
+ * things together as "write once", enforcing the property at the end
+ * of initialization by making those pages read-only and non-coherent.
+ * This allowed better cache utilization since cache inclusion did not
+ * need to be maintained.  However, to do this requires an extra TLB
+ * entry, which on balance is more of a performance hit than the
+ * non-coherence is a performance gain, so we now just make "read
+ * mostly" and "write once" be synonyms.  We keep the attribute
+ * separate in case we change our minds at a future date.
  */
-#define __write_once __attribute__((__section__(".w1data")))
+#define __write_once __read_mostly
 
 #endif /* _ASM_TILE_CACHE_H */
diff --git a/arch/tile/include/asm/cacheflush.h b/arch/tile/include/asm/cacheflush.h
index 0fc63c4..92ee4c8 100644
--- a/arch/tile/include/asm/cacheflush.h
+++ b/arch/tile/include/asm/cacheflush.h
@@ -75,23 +75,6 @@ static inline void copy_to_user_page(struct vm_area_struct *vma,
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy((dst), (src), (len))
 
-/*
- * Invalidate a VA range; pads to L2 cacheline boundaries.
- *
- * Note that on TILE64, __inv_buffer() actually flushes modified
- * cache lines in addition to invalidating them, i.e., it's the
- * same as __finv_buffer().
- */
-static inline void __inv_buffer(void *buffer, size_t size)
-{
-	char *next = (char *)((long)buffer & -L2_CACHE_BYTES);
-	char *finish = (char *)L2_CACHE_ALIGN((long)buffer + size);
-	while (next < finish) {
-		__insn_inv(next);
-		next += CHIP_INV_STRIDE();
-	}
-}
-
 /* Flush a VA range; pads to L2 cacheline boundaries. */
 static inline void __flush_buffer(void *buffer, size_t size)
 {
@@ -115,13 +98,6 @@ static inline void __finv_buffer(void *buffer, size_t size)
 }
 
 
-/* Invalidate a VA range and wait for it to be complete. */
-static inline void inv_buffer(void *buffer, size_t size)
-{
-	__inv_buffer(buffer, size);
-	mb();
-}
-
 /*
  * Flush a locally-homecached VA range and wait for the evicted
  * cachelines to hit memory.
@@ -142,6 +118,26 @@ static inline void finv_buffer_local(void *buffer, size_t size)
 	mb_incoherent();
 }
 
+#ifdef __tilepro__
+/* Invalidate a VA range; pads to L2 cacheline boundaries. */
+static inline void __inv_buffer(void *buffer, size_t size)
+{
+	char *next = (char *)((long)buffer & -L2_CACHE_BYTES);
+	char *finish = (char *)L2_CACHE_ALIGN((long)buffer + size);
+	while (next < finish) {
+		__insn_inv(next);
+		next += CHIP_INV_STRIDE();
+	}
+}
+
+/* Invalidate a VA range and wait for it to be complete. */
+static inline void inv_buffer(void *buffer, size_t size)
+{
+	__inv_buffer(buffer, size);
+	mb();
+}
+#endif
+
 /*
  * Flush and invalidate a VA range that is homed remotely, waiting
  * until the memory controller holds the flushed values.  If "hfh" is
diff --git a/arch/tile/include/asm/cmpxchg.h b/arch/tile/include/asm/cmpxchg.h
index 276f067..4001d5e 100644
--- a/arch/tile/include/asm/cmpxchg.h
+++ b/arch/tile/include/asm/cmpxchg.h
@@ -20,53 +20,108 @@
 
 #ifndef __ASSEMBLY__
 
-/* Nonexistent functions intended to cause link errors. */
-extern unsigned long __xchg_called_with_bad_pointer(void);
-extern unsigned long __cmpxchg_called_with_bad_pointer(void);
+#include <asm/barrier.h>
 
-#define xchg(ptr, x)							\
+/* Nonexistent functions intended to cause compile errors. */
+extern void __xchg_called_with_bad_pointer(void)
+	__compiletime_error("Bad argument size for xchg");
+extern void __cmpxchg_called_with_bad_pointer(void)
+	__compiletime_error("Bad argument size for cmpxchg");
+
+#ifndef __tilegx__
+
+/* Note the _atomic_xxx() routines include a final mb(). */
+int _atomic_xchg(int *ptr, int n);
+int _atomic_xchg_add(int *v, int i);
+int _atomic_xchg_add_unless(int *v, int a, int u);
+int _atomic_cmpxchg(int *ptr, int o, int n);
+u64 _atomic64_xchg(u64 *v, u64 n);
+u64 _atomic64_xchg_add(u64 *v, u64 i);
+u64 _atomic64_xchg_add_unless(u64 *v, u64 a, u64 u);
+u64 _atomic64_cmpxchg(u64 *v, u64 o, u64 n);
+
+#define xchg(ptr, n)							\
+	({								\
+		if (sizeof(*(ptr)) != 4)				\
+			__xchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic_xchg((int *)(ptr), (int)(n));	\
+	})
+
+#define cmpxchg(ptr, o, n)						\
+	({								\
+		if (sizeof(*(ptr)) != 4)				\
+			__cmpxchg_called_with_bad_pointer();		\
+		smp_mb();	/* final mb() is in _atomic_cmpxchg() */ \
+		(typeof(*(ptr)))_atomic_cmpxchg((int *)(ptr), (int)(o), (int)(n)); \
+	})
+
+#define xchg64(ptr, n)							\
+	({								\
+		if (sizeof(*(ptr)) != 8)				\
+			__xchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic64_xchg((u64 *)(ptr), (u64)(n));	\
+	})
+
+#define cmpxchg64(ptr, o, n)						\
+	({								\
+		if (sizeof(*(ptr)) != 8)				\
+			__cmpxchg_called_with_bad_pointer();		\
+		smp_mb();	/* final mb() is in _atomic64_cmpxchg() */ \
+		(typeof(*(ptr)))_atomic64_cmpxchg((u64 *)(ptr), (u64)(o), (u64)(n)); \
+	})
+
+#else
+
+#define xchg(ptr, n)							\
 	({								\
 		typeof(*(ptr)) __x;					\
+		smp_mb();						\
 		switch (sizeof(*(ptr))) {				\
 		case 4:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic_xchg( \
-				(atomic_t *)(ptr),			\
-				(u32)(typeof((x)-(x)))(x));		\
+			__x = (typeof(__x))(unsigned long)		\
+				__insn_exch4((ptr), (u32)(unsigned long)(n)); \
 			break;						\
 		case 8:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic64_xchg( \
-				(atomic64_t *)(ptr),			\
-				(u64)(typeof((x)-(x)))(x));		\
+			__x = (typeof(__x))			\
+				__insn_exch((ptr), (unsigned long)(n));	\
 			break;						\
 		default:						\
 			__xchg_called_with_bad_pointer();		\
+			break;						\
 		}							\
+		smp_mb();						\
 		__x;							\
 	})
 
 #define cmpxchg(ptr, o, n)						\
 	({								\
 		typeof(*(ptr)) __x;					\
+		__insn_mtspr(SPR_CMPEXCH_VALUE, (unsigned long)(o));	\
+		smp_mb();						\
 		switch (sizeof(*(ptr))) {				\
 		case 4:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic_cmpxchg( \
-				(atomic_t *)(ptr),			\
-				(u32)(typeof((o)-(o)))(o),		\
-				(u32)(typeof((n)-(n)))(n));		\
+			__x = (typeof(__x))(unsigned long)		\
+				__insn_cmpexch4((ptr), (u32)(unsigned long)(n)); \
 			break;						\
 		case 8:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic64_cmpxchg( \
-				(atomic64_t *)(ptr),			\
-				(u64)(typeof((o)-(o)))(o),		\
-				(u64)(typeof((n)-(n)))(n));		\
+			__x = (typeof(__x))__insn_cmpexch((ptr), (u64)(n)); \
 			break;						\
 		default:						\
 			__cmpxchg_called_with_bad_pointer();		\
+			break;						\
 		}							\
+		smp_mb();						\
 		__x;							\
 	})
 
-#define tas(ptr) (xchg((ptr), 1))
+#define xchg64 xchg
+#define cmpxchg64 cmpxchg
+
+#endif
+
+#define tas(ptr) xchg((ptr), 1)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
index 5182705..6ab8bf1 100644
--- a/arch/tile/include/asm/device.h
+++ b/arch/tile/include/asm/device.h
@@ -23,7 +23,10 @@ struct dev_archdata {
 	/* Offset of the DMA address from the PA. */
 	dma_addr_t		dma_offset;
 
-	/* Highest DMA address that can be generated by this device. */
+	/*
+	 * Highest DMA address that can be generated by devices that
+	 * have limited DMA capability, i.e. non 64-bit capable.
+	 */
 	dma_addr_t		max_direct_dma_addr;
 };
 
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index f2ff191..1eae359 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -20,9 +20,14 @@
 #include <linux/cache.h>
 #include <linux/io.h>
 
+#ifdef __tilegx__
+#define ARCH_HAS_DMA_GET_REQUIRED_MASK
+#endif
+
 extern struct dma_map_ops *tile_dma_map_ops;
 extern struct dma_map_ops *gx_pci_dma_map_ops;
 extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
@@ -44,12 +49,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-	return paddr + get_dma_offset(dev);
+	return paddr;
 }
 
 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-	return daddr - get_dma_offset(dev);
+	return daddr;
 }
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
@@ -87,11 +92,19 @@ dma_set_mask(struct device *dev, u64 mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	/* Handle legacy PCI devices with limited memory addressability. */
-	if ((dma_ops == gx_pci_dma_map_ops) && (mask <= DMA_BIT_MASK(32))) {
-		set_dma_ops(dev, gx_legacy_pci_dma_map_ops);
-		set_dma_offset(dev, 0);
-		if (mask > dev->archdata.max_direct_dma_addr)
+	/*
+	 * For PCI devices with 64-bit DMA addressing capability, promote
+	 * the dma_ops to hybrid, with the consistent memory DMA space limited
+	 * to 32-bit. For 32-bit capable devices, limit the streaming DMA
+	 * address range to max_direct_dma_addr.
+	 */
+	if (dma_ops == gx_pci_dma_map_ops ||
+	    dma_ops == gx_hybrid_pci_dma_map_ops ||
+	    dma_ops == gx_legacy_pci_dma_map_ops) {
+		if (mask == DMA_BIT_MASK(64) &&
+		    dma_ops == gx_legacy_pci_dma_map_ops)
+			set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
+		else if (mask > dev->archdata.max_direct_dma_addr)
 			mask = dev->archdata.max_direct_dma_addr;
 	}
 
diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h
index ff8a934..41d9878 100644
--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -30,7 +30,6 @@ typedef unsigned long elf_greg_t;
 #define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t))
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
-#define EM_TILE64  187
 #define EM_TILEPRO 188
 #define EM_TILEGX  191
 
@@ -132,6 +131,15 @@ extern int dump_task_regs(struct task_struct *, elf_gregset_t *);
 struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int executable_stack);
+#define ARCH_DLINFO \
+do { \
+	NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE); \
+} while (0)
+
+struct mm_struct;
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+#define arch_randomize_brk arch_randomize_brk
+
 #ifdef CONFIG_COMPAT
 
 #define COMPAT_ELF_PLATFORM "tilegx-m32"
diff --git a/arch/tile/include/asm/fixmap.h b/arch/tile/include/asm/fixmap.h
index e16dbf9..c6b9c1b 100644
--- a/arch/tile/include/asm/fixmap.h
+++ b/arch/tile/include/asm/fixmap.h
@@ -78,14 +78,6 @@ enum fixed_addresses {
 #endif
 };
 
-extern void __set_fixmap(enum fixed_addresses idx,
-			 unsigned long phys, pgprot_t flags);
-
-#define set_fixmap(idx, phys) \
-		__set_fixmap(idx, phys, PAGE_KERNEL)
-#define clear_fixmap(idx) \
-		__set_fixmap(idx, 0, __pgprot(0))
-
 #define __FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
 #define __FIXADDR_BOOT_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_START		(FIXADDR_TOP + PAGE_SIZE - __FIXADDR_SIZE)
diff --git a/arch/tile/include/asm/ftrace.h b/arch/tile/include/asm/ftrace.h
index 461459b..13a9bb8 100644
--- a/arch/tile/include/asm/ftrace.h
+++ b/arch/tile/include/asm/ftrace.h
@@ -15,6 +15,26 @@
 #ifndef _ASM_TILE_FTRACE_H
 #define _ASM_TILE_FTRACE_H
 
-/* empty */
+#ifdef CONFIG_FUNCTION_TRACER
+
+#define MCOUNT_ADDR ((unsigned long)(__mcount))
+#define MCOUNT_INSN_SIZE 8		/* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void __mcount(void);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+
+struct dyn_arch_ftrace {
+};
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #endif /* _ASM_TILE_FTRACE_H */
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index 5909ac3..1a6ef1b 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -43,6 +43,7 @@
 	    ".pushsection .fixup,\"ax\"\n"			\
 	    "0: { movei %0, %5; j 9f }\n"			\
 	    ".section __ex_table,\"a\"\n"			\
+	    ".align 8\n"					\
 	    ".quad 1b, 0b\n"					\
 	    ".popsection\n"					\
 	    "9:"						\
diff --git a/arch/tile/include/asm/homecache.h b/arch/tile/include/asm/homecache.h
index 7b77713..7ddd1b8 100644
--- a/arch/tile/include/asm/homecache.h
+++ b/arch/tile/include/asm/homecache.h
@@ -33,8 +33,7 @@ struct zone;
 
 /*
  * Is this page immutable (unwritable) and thus able to be cached more
- * widely than would otherwise be possible?  On tile64 this means we
- * mark the PTE to cache locally; on tilepro it means we have "nc" set.
+ * widely than would otherwise be possible?  This means we have "nc" set.
  */
 #define PAGE_HOME_IMMUTABLE -2
 
@@ -44,16 +43,8 @@ struct zone;
  */
 #define PAGE_HOME_INCOHERENT -3
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Home for the page is distributed via hash-for-home. */
 #define PAGE_HOME_HASH -4
-#endif
-
-/* Homing is unknown or unspecified.  Not valid for page_home(). */
-#define PAGE_HOME_UNKNOWN -5
-
-/* Home on the current cpu.  Not valid for page_home(). */
-#define PAGE_HOME_HERE -6
 
 /* Support wrapper to use instead of explicit hv_flush_remote(). */
 extern void flush_remote(unsigned long cache_pfn, unsigned long cache_length,
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 3167291..9fe4349 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -19,7 +19,8 @@
 #include <linux/bug.h>
 #include <asm/page.h>
 
-#define IO_SPACE_LIMIT 0xfffffffful
+/* Maximum PCI I/O space address supported. */
+#define IO_SPACE_LIMIT 0xffffffff
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
@@ -254,7 +255,7 @@ static inline void writeq(u64 val, unsigned long addr)
 
 static inline void memset_io(volatile void *dst, int val, size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)dst & 0x3);
 	val = (val & 0xff) * 0x01010101;
 	for (x = 0; x < len; x += 4)
@@ -264,7 +265,7 @@ static inline void memset_io(volatile void *dst, int val, size_t len)
 static inline void memcpy_fromio(void *dst, const volatile void __iomem *src,
 				 size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)src & 0x3);
 	for (x = 0; x < len; x += 4)
 		*(u32 *)(dst + x) = readl(src + x);
@@ -273,7 +274,7 @@ static inline void memcpy_fromio(void *dst, const volatile void __iomem *src,
 static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 				size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)dst & 0x3);
 	for (x = 0; x < len; x += 4)
 		writel(*(u32 *)(src + x), dst + x);
@@ -281,8 +282,108 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 
 #endif
 
+#if CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO)
+
+static inline u8 inb(unsigned long addr)
+{
+	return readb((volatile void __iomem *) addr);
+}
+
+static inline u16 inw(unsigned long addr)
+{
+	return readw((volatile void __iomem *) addr);
+}
+
+static inline u32 inl(unsigned long addr)
+{
+	return readl((volatile void __iomem *) addr);
+}
+
+static inline void outb(u8 b, unsigned long addr)
+{
+	writeb(b, (volatile void __iomem *) addr);
+}
+
+static inline void outw(u16 b, unsigned long addr)
+{
+	writew(b, (volatile void __iomem *) addr);
+}
+
+static inline void outl(u32 b, unsigned long addr)
+{
+	writel(b, (volatile void __iomem *) addr);
+}
+
+static inline void insb(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u8 *buf = buffer;
+		do {
+			u8 x = inb(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void insw(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u16 *buf = buffer;
+		do {
+			u16 x = inw(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void insl(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u32 *buf = buffer;
+		do {
+			u32 x = inl(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void outsb(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u8 *buf = buffer;
+		do {
+			outb(*buf++, addr);
+		} while (--count);
+	}
+}
+
+static inline void outsw(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u16 *buf = buffer;
+		do {
+			outw(*buf++, addr);
+		} while (--count);
+	}
+}
+
+static inline void outsl(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u32 *buf = buffer;
+		do {
+			outl(*buf++, addr);
+		} while (--count);
+	}
+}
+
+extern void __iomem *ioport_map(unsigned long port, unsigned int len);
+extern void ioport_unmap(void __iomem *addr);
+
+#else
+
 /*
- * The Tile architecture does not support IOPORT, even with PCI.
+ * The TilePro architecture does not support IOPORT, even with PCI.
  * Unfortunately we can't yet simply not declare these methods,
  * since some generic code that compiles into the kernel, but
  * we never run, uses them unconditionally.
@@ -290,7 +391,12 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 
 static inline long ioport_panic(void)
 {
+#ifdef __tilegx__
+	panic("PCI IO space support is disabled. Configure the kernel with"
+	      " CONFIG_TILE_PCI_IO to enable it");
+#else
 	panic("inb/outb and friends do not exist on tile");
+#endif
 	return 0;
 }
 
@@ -335,13 +441,6 @@ static inline void outl(u32 b, unsigned long addr)
 	ioport_panic();
 }
 
-#define inb_p(addr)	inb(addr)
-#define inw_p(addr)	inw(addr)
-#define inl_p(addr)	inl(addr)
-#define outb_p(x, addr)	outb((x), (addr))
-#define outw_p(x, addr)	outw((x), (addr))
-#define outl_p(x, addr)	outl((x), (addr))
-
 static inline void insb(unsigned long addr, void *buffer, int count)
 {
 	ioport_panic();
@@ -372,6 +471,15 @@ static inline void outsl(unsigned long addr, const void *buffer, int count)
 	ioport_panic();
 }
 
+#endif /* CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO) */
+
+#define inb_p(addr)	inb(addr)
+#define inw_p(addr)	inw(addr)
+#define inl_p(addr)	inl(addr)
+#define outb_p(x, addr)	outb((x), (addr))
+#define outw_p(x, addr)	outw((x), (addr))
+#define outl_p(x, addr)	outl((x), (addr))
+
 #define ioread16be(addr)	be16_to_cpu(ioread16(addr))
 #define ioread32be(addr)	be32_to_cpu(ioread32(addr))
 #define iowrite16be(v, addr)	iowrite16(be16_to_cpu(v), (addr))
diff --git a/arch/tile/include/asm/irqflags.h b/arch/tile/include/asm/irqflags.h
index c96f9bb..71af574 100644
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -124,6 +124,12 @@
 DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #define INITIAL_INTERRUPTS_ENABLED (1ULL << INT_MEM_ERROR)
 
+#ifdef CONFIG_DEBUG_PREEMPT
+/* Due to inclusion issues, we can't rely on <linux/smp.h> here. */
+extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
+#endif
+
 /* Disable interrupts. */
 #define arch_local_irq_disable() \
 	interrupt_mask_set_mask(LINUX_MASKABLE_INTERRUPTS)
@@ -132,9 +138,18 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #define arch_local_irq_disable_all() \
 	interrupt_mask_set_mask(-1ULL)
 
+/*
+ * Read the set of maskable interrupts.
+ * We avoid the preemption warning here via __this_cpu_ptr since even
+ * if irqs are already enabled, it's harmless to read the wrong cpu's
+ * enabled mask.
+ */
+#define arch_local_irqs_enabled() \
+	(*__this_cpu_ptr(&interrupts_enabled_mask))
+
 /* Re-enable all maskable interrupts. */
 #define arch_local_irq_enable() \
-	interrupt_mask_reset_mask(__get_cpu_var(interrupts_enabled_mask))
+	interrupt_mask_reset_mask(arch_local_irqs_enabled())
 
 /* Disable or enable interrupts based on flag argument. */
 #define arch_local_irq_restore(disabled) do { \
@@ -161,7 +176,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Prevent the given interrupt from being enabled next time we enable irqs. */
 #define arch_local_irq_mask(interrupt) \
-	(__get_cpu_var(interrupts_enabled_mask) &= ~(1ULL << (interrupt)))
+	this_cpu_and(interrupts_enabled_mask, ~(1ULL << (interrupt)))
 
 /* Prevent the given interrupt from being enabled immediately. */
 #define arch_local_irq_mask_now(interrupt) do { \
@@ -171,7 +186,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Allow the given interrupt to be enabled next time we enable irqs. */
 #define arch_local_irq_unmask(interrupt) \
-	(__get_cpu_var(interrupts_enabled_mask) |= (1ULL << (interrupt)))
+	this_cpu_or(interrupts_enabled_mask, (1ULL << (interrupt)))
 
 /* Allow the given interrupt to be enabled immediately, if !irqs_disabled. */
 #define arch_local_irq_unmask_now(interrupt) do { \
diff --git a/arch/tile/include/asm/hw_irq.h b/arch/tile/include/asm/kdebug.h
index 4fac5fbf..5bbbfa9 100644
--- a/arch/tile/include/asm/hw_irq.h
+++ b/arch/tile/include/asm/kdebug.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or
  *   modify it under the terms of the GNU General Public License
@@ -12,7 +12,17 @@
  *   more details.
  */
 
-#ifndef _ASM_TILE_HW_IRQ_H
-#define _ASM_TILE_HW_IRQ_H
+#ifndef _ASM_TILE_KDEBUG_H
+#define _ASM_TILE_KDEBUG_H
 
-#endif /* _ASM_TILE_HW_IRQ_H */
+#include <linux/notifier.h>
+
+enum die_val {
+	DIE_OOPS = 1,
+	DIE_BREAK,
+	DIE_SSTEPBP,
+	DIE_PAGE_FAULT,
+	DIE_COMPILED_BPT
+};
+
+#endif /* _ASM_TILE_KDEBUG_H */
diff --git a/arch/tile/include/asm/kgdb.h b/arch/tile/include/asm/kgdb.h
new file mode 100644
index 0000000..280c181
--- /dev/null
+++ b/arch/tile/include/asm/kgdb.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx KGDB support.
+ */
+
+#ifndef __TILE_KGDB_H__
+#define __TILE_KGDB_H__
+
+#include <linux/kdebug.h>
+#include <arch/opcode.h>
+
+#define GDB_SIZEOF_REG		sizeof(unsigned long)
+
+/*
+ * TILE-Gx gdb is expecting the following register layout:
+ * 56 GPRs(R0 - R52, TP, SP, LR), 8 special GPRs(networks and ZERO),
+ * plus the PC and the faultnum.
+ *
+ * Even though the kernel does not use the 8 special GPRs, they need to be
+ * present in the registers sent for correct processing in the host-side gdb.
+ *
+ */
+#define DBG_MAX_REG_NUM		(56+8+2)
+#define NUMREGBYTES		(DBG_MAX_REG_NUM * GDB_SIZEOF_REG)
+
+/*
+ * BUFMAX defines the maximum number of characters in the inbound/outbound
+ * buffers.  At least NUMREGBYTES*2 are needed for register packets; a
+ * longer buffer is needed to list all threads.
+ */
+#define BUFMAX			2048
+
+#define BREAK_INSTR_SIZE	TILEGX_BUNDLE_SIZE_IN_BYTES
+
+/*
+ * A cache flush is needed to set/clear a software breakpoint or write memory.
+ */
+#define CACHE_FLUSH_IS_SAFE	1
+
+/*
+ * The compiled-in breakpoint instruction can be used to "break" into
+ * the debugger via magic system request key (sysrq-G).
+ */
+static tile_bundle_bits compiled_bpt = TILEGX_BPT_BUNDLE | DIE_COMPILED_BPT;
+
+enum tilegx_regnum {
+	TILEGX_PC_REGNUM = TREG_LAST_GPR + 9,
+	TILEGX_FAULTNUM_REGNUM,
+};
+
+/*
+ * Generate a breakpoint exception to "break" into the debugger.
+ */
+static inline void arch_kgdb_breakpoint(void)
+{
+	asm volatile (".quad %0\n\t"
+		      ::""(compiled_bpt));
+}
+
+#endif /* __TILE_KGDB_H__ */
diff --git a/arch/tile/include/asm/kprobes.h b/arch/tile/include/asm/kprobes.h
new file mode 100644
index 0000000..d8f9a83
--- /dev/null
+++ b/arch/tile/include/asm/kprobes.h
@@ -0,0 +1,79 @@
+/*
+ * arch/tile/include/asm/kprobes.h
+ *
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_KPROBES_H
+#define _ASM_TILE_KPROBES_H
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#include <arch/opcode.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+#define MAX_INSN_SIZE			2
+
+#define kretprobe_blacklist_size 0
+
+typedef tile_bundle_bits kprobe_opcode_t;
+
+#define flush_insn_slot(p)	/* flush icache over p's MAX_INSN_SIZE slots */ \
+	flush_icache_range((unsigned long)(p)->addr,			\
+			   (unsigned long)(p)->addr +			\
+			   (MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+
+struct kprobe;
+
+/* Architecture specific copy of original instruction. */
+struct arch_specific_insn {
+	kprobe_opcode_t *insn;
+};
+
+struct prev_kprobe {
+	struct kprobe *kp;
+	unsigned long status;
+	unsigned long saved_pc;
+};
+
+#define MAX_JPROBES_STACK_SIZE 128
+#define MAX_JPROBES_STACK_ADDR \
+	(((unsigned long)current_thread_info()) + THREAD_SIZE - 32 \
+		- sizeof(struct pt_regs))
+
+#define MIN_JPROBES_STACK_SIZE(ADDR)					\
+	((((ADDR) + MAX_JPROBES_STACK_SIZE) > MAX_JPROBES_STACK_ADDR)	\
+		? MAX_JPROBES_STACK_ADDR - (ADDR)			\
+		: MAX_JPROBES_STACK_SIZE)
+
+/* per-cpu kprobe control block. */
+struct kprobe_ctlblk {
+	unsigned long kprobe_status;
+	unsigned long kprobe_saved_pc;
+	unsigned long jprobe_saved_sp;
+	struct prev_kprobe prev_kprobe;
+	struct pt_regs jprobe_saved_regs;
+	char jprobes_stack[MAX_JPROBES_STACK_SIZE];
+};
+
+extern tile_bundle_bits breakpoint2_insn;
+extern tile_bundle_bits breakpoint_insn;
+
+void arch_remove_kprobe(struct kprobe *);
+
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+			     unsigned long val, void *data);
+
+#endif /* _ASM_TILE_KPROBES_H */
diff --git a/arch/tile/include/asm/mmu.h b/arch/tile/include/asm/mmu.h
index e2c7890..0cab118 100644
--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -22,6 +22,7 @@ struct mm_context {
 	 * semaphore but atomically, but it is conservatively set.
 	 */
 	unsigned long priority_cached;
+	unsigned long vdso_base;
 };
 
 typedef struct mm_context mm_context_t;
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index 37f0b74..4734215 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -45,7 +45,7 @@ static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
 
 static inline void install_page_table(pgd_t *pgdir, int asid)
 {
-	pte_t *ptep = virt_to_pte(NULL, (unsigned long)pgdir);
+	pte_t *ptep = virt_to_kpte((unsigned long)pgdir);
 	__install_page_table(pgdir, asid, *ptep);
 }
 
diff --git a/arch/tile/include/asm/mmzone.h b/arch/tile/include/asm/mmzone.h
index 9d3dbce..804f109 100644
--- a/arch/tile/include/asm/mmzone.h
+++ b/arch/tile/include/asm/mmzone.h
@@ -42,7 +42,7 @@ static inline int pfn_to_nid(unsigned long pfn)
 
 #define kern_addr_valid(kaddr)	virt_addr_valid((void *)kaddr)
 
-static inline int pfn_valid(int pfn)
+static inline int pfn_valid(unsigned long pfn)
 {
 	int nid = pfn_to_nid(pfn);
 
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index dd033a4..6346888 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -39,6 +39,12 @@
 #define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 
 /*
+ * We do define AT_SYSINFO_EHDR to support vDSO,
+ * but don't use the gate mechanism.
+ */
+#define __HAVE_ARCH_GATE_AREA		1
+
+/*
  * If the Kconfig doesn't specify, set a maximum zone order that
  * is enough so that we can create huge pages from small pages given
  * the respective sizes of the two page types.  See <linux/mmzone.h>.
@@ -142,8 +148,12 @@ static inline __attribute_const__ int get_order(unsigned long size)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
+/* Allow overriding how much VA or PA the kernel will use. */
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+
 /* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
 #define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
 #define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
 #define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -154,7 +164,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * We reserve the lower half of memory for user-space programs, and the
  * upper half for system code.  We re-map all of physical memory in the
  * upper half, which takes a quarter of our VA space.  Then we have
- * the vmalloc regions.  The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions.  The supervisor code lives at the highest address,
  * with the hypervisor above that.
  *
  * Loadable kernel modules are placed immediately after the static
@@ -166,26 +176,19 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * Similarly, for now we don't play any struct page mapping games.
  */
 
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
 # error Too much PA to map with the VA available!
 #endif
-#define HALF_VA_SPACE           (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
 
-#define MEM_LOW_END		(HALF_VA_SPACE - 1)         /* low half */
-#define MEM_HIGH_START		(-HALF_VA_SPACE)            /* high half */
-#define PAGE_OFFSET		MEM_HIGH_START
-#define FIXADDR_BASE		_AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP		_AC(0xfffffff500000000, UL) /* 4 GB */
+#define PAGE_OFFSET		(-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR	_AC(0xfffffff800000000, UL)  /* high 32GB */
+#define FIXADDR_BASE		(KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP		(KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
 #define _VMALLOC_START		FIXADDR_TOP
-#define HUGE_VMAP_BASE		_AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START		_AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT		MEM_SV_START
-#define MEM_MODULE_START	_AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE		(KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START		(KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START	(MEM_SV_START + (256*1024*1024)) /* 256 MB */
 #define MEM_MODULE_END		(MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START		_AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR	MEM_SV_START
 
 #else /* !__tilegx__ */
 
@@ -207,25 +210,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * values, and after that, we show "typical" values, since the actual
  * addresses depend on kernel #defines.
  *
- * MEM_HV_INTRPT                   0xfe000000
- * MEM_SV_INTRPT (kernel code)     0xfd000000
+ * MEM_HV_START                    0xfe000000
+ * MEM_SV_START  (kernel code)     0xfd000000
  * MEM_USER_INTRPT (user vector)   0xfc000000
- * FIX_KMAP_xxx                    0xf8000000 (via NR_CPUS * KM_TYPE_NR)
- * PKMAP_BASE                      0xf7000000 (via LAST_PKMAP)
- * HUGE_VMAP                       0xf3000000 (via CONFIG_NR_HUGE_VMAPS)
- * VMALLOC_START                   0xf0000000 (via __VMALLOC_RESERVE)
+ * FIX_KMAP_xxx                    0xfa000000 (via NR_CPUS * KM_TYPE_NR)
+ * PKMAP_BASE                      0xf9000000 (via LAST_PKMAP)
+ * VMALLOC_START                   0xf7000000 (via VMALLOC_RESERVE)
  * mapped LOWMEM                   0xc0000000
  */
 
 #define MEM_USER_INTRPT		_AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT		_AC(0xfd000000, UL)
-#define MEM_HV_INTRPT		_AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT	_AC(0xfd000000, UL)
-#define MEM_SV_INTRPT		_AC(0xfe000000, UL)
-#define MEM_HV_INTRPT		_AC(0xff000000, UL)
-#endif
+#define MEM_SV_START		_AC(0xfd000000, UL)
+#define MEM_HV_START		_AC(0xfe000000, UL)
 
 #define INTRPT_SIZE		0x4000
 
@@ -246,7 +242,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #endif /* __tilegx__ */
 
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(VDSO_BUILD)
 
 #ifdef CONFIG_HIGHMEM
 
@@ -332,6 +328,7 @@ static inline int pfn_valid(unsigned long pfn)
 
 struct mm_struct;
 extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
+extern pte_t *virt_to_kpte(unsigned long kaddr);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 54a9242..dfedd7a 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -17,7 +17,6 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/pci.h>
-#include <linux/numa.h>
 #include <asm-generic/pci_iomap.h>
 
 #ifndef __tilegx__
@@ -29,7 +28,6 @@ struct pci_controller {
 	int index;		/* PCI domain number */
 	struct pci_bus *root_bus;
 
-	int first_busno;
 	int last_busno;
 
 	int hv_cfg_fd[2];	/* config{0,1} fds for this PCIe controller */
@@ -124,6 +122,11 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr) {}
  * the CPA plus TILE_PCI_MEM_MAP_BASE_OFFSET. To support 32-bit
  * devices, we create a separate map region that handles the low
  * 4GB.
+ *
+ * This design lets us avoid the "PCI hole" problem where the host bridge
+ * won't pass DMA traffic with target addresses that happen to fall within the
+ * BAR space. This enables us to use all the physical memory for DMA, instead
+ * of wasting the same amount of physical memory as the BAR window size.
  */
 #define	TILE_PCI_MEM_MAP_BASE_OFFSET	(1ULL << CHIP_PA_WIDTH())
 
@@ -145,6 +148,10 @@ struct pci_controller {
 
 	int pio_mem_index;	/* PIO region index for memory access */
 
+#ifdef CONFIG_TILE_PCI_IO
+	int pio_io_index;	/* PIO region index for I/O space access */
+#endif
+
 	/*
 	 * Mem-Map regions for all the memory controllers so that Linux can
 	 * map all of its physical memory space to the PCI bus.
@@ -154,6 +161,10 @@ struct pci_controller {
 	int index;		/* PCI domain number */
 	struct pci_bus *root_bus;
 
+	/* PCI I/O space resource for this controller. */
+	struct resource io_space;
+	char io_space_name[32];
+
 	/* PCI memory space resource for this controller. */
 	struct resource mem_space;
 	char mem_space_name[32];
@@ -166,13 +177,11 @@ struct pci_controller {
 
 	/* Table that maps the INTx numbers to Linux irq numbers. */
 	int irq_intx_table[4];
-
-	/* Address ranges that are routed to this controller/bridge. */
-	struct resource mem_resources[3];
 };
 
 extern struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
 extern gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
+extern int num_trio_shims;
 
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 
@@ -211,7 +220,8 @@ static inline int pcibios_assign_all_busses(void)
 }
 
 #define PCIBIOS_MIN_MEM		0
-#define PCIBIOS_MIN_IO		0
+/* Minimum PCI I/O address, starting at the page boundary. */
+#define PCIBIOS_MIN_IO		PAGE_SIZE
 
 /* Use any cpu for PCI. */
 #define cpumask_of_pcibus(bus) cpu_online_mask
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index 4ce4a7a..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -84,10 +84,12 @@ extern unsigned long VMALLOC_RESERVE /* = CONFIG_VMALLOC_RESERVE */;
 /* We have no pmd or pud since we are strictly a two-level page table */
 #include <asm-generic/pgtable-nopmd.h>
 
+static inline int pud_huge_page(pud_t pud)	{ return 0; }
+
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_INTRPT;
+	return addr >= MEM_HV_START;
 }
 
 /*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 2492fa5..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -63,6 +63,15 @@
 /* We have no pud since we are a three-level page table. */
 #include <asm-generic/pgtable-nopud.h>
 
+/*
+ * pmds are the same as pgds and ptes, so converting is a no-op.
+ */
+#define pmd_pte(pmd) (pmd)
+#define pmdp_ptep(pmdp) (pmdp)
+#define pte_pmd(pte) (pte)
+
+#define pud_pte(pud) ((pud).pgd)
+
 static inline int pud_none(pud_t pud)
 {
 	return pud_val(pud) == 0;
@@ -73,6 +82,11 @@ static inline int pud_present(pud_t pud)
 	return pud_val(pud) & _PAGE_PRESENT;
 }
 
+static inline int pud_huge_page(pud_t pud)
+{
+	return pud_val(pud) & _PAGE_HUGE_PAGE;
+}
+
 #define pmd_ERROR(e) \
 	pr_err("%s:%d: bad pmd 0x%016llx.\n", __FILE__, __LINE__, pmd_val(e))
 
@@ -89,6 +103,9 @@ static inline int pud_bad(pud_t pud)
 /* Return the page-table frame number (ptfn) that a pud_t points at. */
 #define pud_ptfn(pud) hv_pte_get_ptfn((pud).pgd)
 
+/* Return the page frame number (pfn) that a pud_t points at. */
+#define pud_pfn(pud) pte_pfn(pud_pte(pud))
+
 /*
  * A given kernel pud_t maps to a kernel pmd_t table at a specific
  * virtual address.  Since kernel pmd_t tables can be aligned at
@@ -123,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_START ||
-		(addr > MEM_LOW_END && addr < MEM_HIGH_START);
+	return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
 }
 
 /*
@@ -152,13 +168,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return hv_pte(__insn_exch(&ptep->val, 0UL));
 }
 
-/*
- * pmds are the same as pgds and ptes, so converting is a no-op.
- */
-#define pmd_pte(pmd) (pmd)
-#define pmdp_ptep(pmdp) (pmdp)
-#define pte_pmd(pte) (pte)
-
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_TILE_PGTABLE_64_H */
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index b3f1049..4232363 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
 #ifndef _ASM_TILE_PROCESSOR_H
 #define _ASM_TILE_PROCESSOR_H
 
+#include <arch/chip.h>
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -25,7 +27,6 @@
 #include <asm/ptrace.h>
 #include <asm/percpu.h>
 
-#include <arch/chip.h>
 #include <arch/spr_def.h>
 
 struct task_struct;
@@ -110,18 +111,16 @@ struct thread_struct {
 	unsigned long long interrupt_mask;
 	/* User interrupt-control 0 state */
 	unsigned long intctrl_0;
-#if CHIP_HAS_PROC_STATUS_SPR()
+	/* Is this task currently doing a backtrace? */
+	bool in_backtrace;
 	/* Any other miscellaneous processor state bits */
 	unsigned long proc_status;
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	/* Interrupt base for PL0 interrupts */
 	unsigned long interrupt_vector_base;
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	/* Tile cache retry fifo high-water mark */
 	unsigned long tile_rtf_hwm;
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	/* Data stream prefetch control */
 	unsigned long dstream_pf;
@@ -134,21 +133,16 @@ struct thread_struct {
 	/* Async DMA TLB fault information */
 	struct async_tlb dma_async_tlb;
 #endif
-#if CHIP_HAS_SN_PROC()
-	/* Was static network processor when we were switched out? */
-	int sn_proc_running;
-	/* Async SNI TLB fault information */
-	struct async_tlb sn_async_tlb;
-#endif
 };
 
 #endif /* !__ASSEMBLY__ */
 
 /*
  * Start with "sp" this many bytes below the top of the kernel stack.
- * This preserves the invariant that a called function may write to *sp.
+ * This allows us to be cache-aware when handling the initial save
+ * of the pt_regs value to the stack.
  */
-#define STACK_TOP_DELTA 8
+#define STACK_TOP_DELTA 64
 
 /*
  * When entering the kernel via a fault, start with the top of the
@@ -164,7 +158,7 @@ struct thread_struct {
 #ifndef __ASSEMBLY__
 
 #ifdef __tilegx__
-#define TASK_SIZE_MAX		(MEM_LOW_END + 1)
+#define TASK_SIZE_MAX		(_AC(1, UL) << (MAX_VA_WIDTH - 1))
 #else
 #define TASK_SIZE_MAX		PAGE_OFFSET
 #endif
@@ -178,10 +172,10 @@ struct thread_struct {
 #define TASK_SIZE		TASK_SIZE_MAX
 #endif
 
-/* We provide a minimal "vdso" a la x86; just the sigreturn code for now. */
-#define VDSO_BASE		(TASK_SIZE - PAGE_SIZE)
+#define VDSO_BASE	((unsigned long)current->active_mm->context.vdso_base)
+#define VDSO_SYM(x)	(VDSO_BASE + (unsigned long)(x))
 
-#define STACK_TOP		VDSO_BASE
+#define STACK_TOP		TASK_SIZE
 
 /* STACK_TOP_MAX is used temporarily in execve and should not check COMPAT. */
 #define STACK_TOP_MAX		TASK_SIZE_MAX
@@ -232,21 +226,28 @@ extern int do_work_pending(struct pt_regs *regs, u32 flags);
 unsigned long get_wchan(struct task_struct *p);
 
 /* Return initial ksp value for given task. */
-#define task_ksp0(task) ((unsigned long)(task)->stack + THREAD_SIZE)
+#define task_ksp0(task) \
+	((unsigned long)(task)->stack + THREAD_SIZE - STACK_TOP_DELTA)
 
 /* Return some info about the user process TASK. */
-#define KSTK_TOP(task)	(task_ksp0(task) - STACK_TOP_DELTA)
 #define task_pt_regs(task) \
-  ((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
+	((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
 #define current_pt_regs()                                   \
-  ((struct pt_regs *)((stack_pointer | (THREAD_SIZE - 1)) - \
-                      (KSTK_PTREGS_GAP - 1)) - 1)
+	((struct pt_regs *)((stack_pointer | (THREAD_SIZE - 1)) - \
+			    STACK_TOP_DELTA - (KSTK_PTREGS_GAP - 1)) - 1)
 #define task_sp(task)	(task_pt_regs(task)->sp)
 #define task_pc(task)	(task_pt_regs(task)->pc)
 /* Aliases for pc and sp (used in fs/proc/array.c) */
 #define KSTK_EIP(task)	task_pc(task)
 #define KSTK_ESP(task)	task_sp(task)
 
+/* Fine-grained unaligned JIT support */
+#define GET_UNALIGN_CTL(tsk, adr)	get_unalign_ctl((tsk), (adr))
+#define SET_UNALIGN_CTL(tsk, val)	set_unalign_ctl((tsk), (val))
+
+extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr);
+extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
+
 /* Standard format for printing registers and other word-size data. */
 #ifdef __tilegx__
 # define REGFMT "0x%016lx"
@@ -275,7 +276,6 @@ extern char chip_model[64];
 /* Data on which physical memory controller corresponds to which NUMA node. */
 extern int node_controller[];
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Does the heap allocator return hash-for-home pages by default? */
 extern int hash_default;
 
@@ -285,11 +285,6 @@ extern int kstack_hash;
 /* Does MAP_ANONYMOUS return hash-for-home pages by default? */
 #define uheap_hash hash_default
 
-#else
-#define hash_default 0
-#define kstack_hash 0
-#define uheap_hash 0
-#endif
 
 /* Are we using huge pages in the TLB for kernel data? */
 extern int kdata_huge;
@@ -337,7 +332,6 @@ extern int kdata_huge;
 
 /*
  * Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
  */
 #define USER_PL 0
 #if CONFIG_KERNEL_PL == 2
@@ -346,20 +340,38 @@ extern int kdata_huge;
 #define KERNEL_PL CONFIG_KERNEL_PL
 
 /* SYSTEM_SAVE_K_0 holds the current cpu number ORed with ksp0. */
-#define CPU_LOG_MASK_VALUE 12
-#define CPU_MASK_VALUE ((1 << CPU_LOG_MASK_VALUE) - 1)
-#if CONFIG_NR_CPUS > CPU_MASK_VALUE
-# error Too many cpus!
+#ifdef __tilegx__
+#define CPU_SHIFT 48
+#if CHIP_VA_WIDTH() > CPU_SHIFT
+# error Too many VA bits!
 #endif
+#define MAX_CPU_ID ((1 << (64 - CPU_SHIFT)) - 1)
+#define raw_smp_processor_id() \
+	((int)(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) >> CPU_SHIFT))
+#define get_current_ksp0() \
+	((unsigned long)(((long)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) << \
+			  (64 - CPU_SHIFT)) >> (64 - CPU_SHIFT)))
+#define next_current_ksp0(task) ({ \
+	unsigned long __ksp0 = task_ksp0(task) & ((1UL << CPU_SHIFT) - 1); \
+	unsigned long __cpu = (long)raw_smp_processor_id() << CPU_SHIFT; \
+	__ksp0 | __cpu; \
+})
+#else
+#define LOG2_NR_CPU_IDS 6
+#define MAX_CPU_ID ((1 << LOG2_NR_CPU_IDS) - 1)
 #define raw_smp_processor_id() \
-	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & CPU_MASK_VALUE)
+	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & MAX_CPU_ID)
 #define get_current_ksp0() \
-	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~CPU_MASK_VALUE)
+	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~MAX_CPU_ID)
 #define next_current_ksp0(task) ({ \
 	unsigned long __ksp0 = task_ksp0(task); \
 	int __cpu = raw_smp_processor_id(); \
-	BUG_ON(__ksp0 & CPU_MASK_VALUE); \
+	BUG_ON(__ksp0 & MAX_CPU_ID); \
 	__ksp0 | __cpu; \
 })
+#endif
+#if CONFIG_NR_CPUS > (MAX_CPU_ID + 1)
+# error Too many cpus!
+#endif
 
 #endif /* _ASM_TILE_PROCESSOR_H */
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index fd41226..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -33,12 +33,13 @@ typedef unsigned long pt_reg_t;
 
 #ifndef __ASSEMBLY__
 
+#define regs_return_value(regs) ((regs)->regs[0])
 #define instruction_pointer(regs) ((regs)->pc)
 #define profile_pc(regs) instruction_pointer(regs)
 #define user_stack_pointer(regs) ((regs)->sp)
 
 /* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
 
 /* Fill in a struct pt_regs with the current kernel registers. */
 struct pt_regs *get_pt_regs(struct pt_regs *);
@@ -79,8 +80,7 @@ extern void single_step_execve(void);
 
 struct task_struct;
 
-extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
-			 int error_code);
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs);
 
 #ifdef __tilegx__
 /* We need this since sigval_t has a user pointer in it, for GETSIGINFO etc. */
diff --git a/arch/tile/include/asm/sections.h b/arch/tile/include/asm/sections.h
index 7d8a935..5d5d3b7 100644
--- a/arch/tile/include/asm/sections.h
+++ b/arch/tile/include/asm/sections.h
@@ -25,10 +25,16 @@ extern char _sinitdata[], _einitdata[];
 /* Write-once data is writable only till the end of initialization. */
 extern char __w1data_begin[], __w1data_end[];
 
+extern char vdso_start[], vdso_end[];
+#ifdef CONFIG_COMPAT
+extern char vdso32_start[], vdso32_end[];
+#endif
 
 /* Not exactly sections, but PC comparison points in the code. */
 extern char __rt_sigreturn[], __rt_sigreturn_end[];
-#ifndef __tilegx__
+#ifdef __tilegx__
+extern char __start_unalign_asm_code[], __end_unalign_asm_code[];
+#else
 extern char sys_cmpxchg[], __sys_cmpxchg_end[];
 extern char __sys_cmpxchg_grab_lock[];
 extern char __start_atomic_asm_code[], __end_atomic_asm_code[];
diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
index d048888..e989090 100644
--- a/arch/tile/include/asm/setup.h
+++ b/arch/tile/include/asm/setup.h
@@ -24,9 +24,8 @@
  */
 #define MAXMEM_PFN	PFN_DOWN(MAXMEM)
 
+int tile_console_write(const char *buf, int count);
 void early_panic(const char *fmt, ...);
-void warn_early_printk(void);
-void __init disable_early_printk(void);
 
 /* Init-time routine to do tile-specific per-cpu setup. */
 void setup_cpu(int boot);
diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h
index 1aa759a..9a326b6 100644
--- a/arch/tile/include/asm/smp.h
+++ b/arch/tile/include/asm/smp.h
@@ -101,10 +101,8 @@ void print_disabled_cpus(void);
 extern struct cpumask cpu_lotar_map;
 #define cpu_is_valid_lotar(cpu) cpumask_test_cpu((cpu), &cpu_lotar_map)
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Which processors are used for hash-for-home mapping */
 extern struct cpumask hash_for_home_map;
-#endif
 
 /* Which cpus can have their cache flushed by hv_flush_remote(). */
 extern struct cpumask cpu_cacheable_map;
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index 5f8b6a0..9a12b9c 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -27,7 +27,7 @@
  * Return the "current" portion of a ticket lock value,
  * i.e. the number that currently owns the lock.
  */
-static inline int arch_spin_current(u32 val)
+static inline u32 arch_spin_current(u32 val)
 {
 	return val >> __ARCH_SPIN_CURRENT_SHIFT;
 }
@@ -36,7 +36,7 @@ static inline int arch_spin_current(u32 val)
  * Return the "next" portion of a ticket lock value,
  * i.e. the number that the next task to try to acquire the lock will get.
  */
-static inline int arch_spin_next(u32 val)
+static inline u32 arch_spin_next(u32 val)
 {
 	return val & __ARCH_SPIN_NEXT_MASK;
 }
diff --git a/arch/tile/include/asm/string.h b/arch/tile/include/asm/string.h
index 7535cf1..92b271b 100644
--- a/arch/tile/include/asm/string.h
+++ b/arch/tile/include/asm/string.h
@@ -21,8 +21,10 @@
 #define __HAVE_ARCH_MEMMOVE
 #define __HAVE_ARCH_STRCHR
 #define __HAVE_ARCH_STRLEN
+#define __HAVE_ARCH_STRNLEN
 
 extern __kernel_size_t strlen(const char *);
+extern __kernel_size_t strnlen(const char *, __kernel_size_t);
 extern char *strchr(const char *s, int c);
 extern void *memchr(const void *s, int c, size_t n);
 extern void *memset(void *, int, __kernel_size_t);
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index d1733de..b8aa6df 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -39,6 +39,11 @@ struct thread_info {
 	struct restart_block	restart_block;
 	struct single_step_state *step_state;	/* single step state
 						   (if non-zero) */
+	int			align_ctl;	/* controls unaligned access */
+#ifdef __tilegx__
+	unsigned long		unalign_jit_tmp[4]; /* temp r0..r3 storage */
+	void __user		*unalign_jit_base; /* unalign fixup JIT base */
+#endif
 };
 
 /*
@@ -56,6 +61,7 @@ struct thread_info {
 		.fn = do_no_restart_syscall,	\
 	},					\
 	.step_state	= NULL,			\
+	.align_ctl	= 0,			\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/tile/include/asm/traps.h b/arch/tile/include/asm/traps.h
index e28c3df4..4b99a1c 100644
--- a/arch/tile/include/asm/traps.h
+++ b/arch/tile/include/asm/traps.h
@@ -15,12 +15,13 @@
 #ifndef _ASM_TILE_TRAPS_H
 #define _ASM_TILE_TRAPS_H
 
+#ifndef __ASSEMBLY__
 #include <arch/chip.h>
 
 /* mm/fault.c */
 void do_page_fault(struct pt_regs *, int fault_num,
 		   unsigned long address, unsigned long write);
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 void do_async_page_fault(struct pt_regs *);
 #endif
 
@@ -69,6 +70,16 @@ void gx_singlestep_handle(struct pt_regs *, int fault_num);
 
 /* kernel/intvec_64.S */
 void fill_ra_stack(void);
+
+/* Handle unaligned data fixup. */
+extern void do_unaligned(struct pt_regs *regs, int vecnum);
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __tilegx__
+/* 128-byte JIT per unaligned fixup. */
+#define UNALIGN_JIT_SHIFT    7
 #endif
 
 #endif /* _ASM_TILE_TRAPS_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index e4d44bd..b6cde32 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -127,8 +127,10 @@ extern int fixup_exception(struct pt_regs *regs);
 
 #ifdef __LP64__
 #define _ASM_PTR	".quad"
+#define _ASM_ALIGN	".align 8"
 #else
 #define _ASM_PTR	".long"
+#define _ASM_ALIGN	".align 4"
 #endif
 
 #define __get_user_asm(OP, x, ptr, ret)					\
@@ -137,6 +139,7 @@ extern int fixup_exception(struct pt_regs *regs);
 		     "0: { movei %1, 0; movei %0, %3 }\n"		\
 		     "j 9f\n"						\
 		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_ALIGN "\n"					\
 		     _ASM_PTR " 1b, 0b\n"				\
 		     ".popsection\n"					\
 		     "9:"						\
@@ -168,6 +171,7 @@ extern int fixup_exception(struct pt_regs *regs);
 			     "0: { movei %1, 0; movei %2, 0 }\n"	\
 			     "{ movei %0, %4; j 9f }\n"			\
 			     ".section __ex_table,\"a\"\n"		\
+			     ".align 4\n"				\
 			     ".word 1b, 0b\n"				\
 			     ".word 2b, 0b\n"				\
 			     ".popsection\n"				\
@@ -224,6 +228,7 @@ extern int __get_user_bad(void)
 		     ".pushsection .fixup,\"ax\"\n"			\
 		     "0: { movei %0, %3; j 9f }\n"			\
 		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_ALIGN "\n"					\
 		     _ASM_PTR " 1b, 0b\n"				\
 		     ".popsection\n"					\
 		     "9:"						\
@@ -248,6 +253,7 @@ extern int __get_user_bad(void)
 			     ".pushsection .fixup,\"ax\"\n"		\
 			     "0: { movei %0, %4; j 9f }\n"		\
 			     ".section __ex_table,\"a\"\n"		\
+			     ".align 4\n"				\
 			     ".word 1b, 0b\n"				\
 			     ".word 2b, 0b\n"				\
 			     ".popsection\n"				\
@@ -567,37 +573,6 @@ static inline unsigned long __must_check flush_user(
 }
 
 /**
- * inv_user: - Invalidate a block of memory in user space from cache.
- * @mem:   Destination address, in user space.
- * @len:   Number of bytes to invalidate.
- *
- * Returns number of bytes that could not be invalidated.
- * On success, this will be zero.
- *
- * Note that on Tile64, the "inv" operation is in fact a
- * "flush and invalidate", so cache write-backs will occur prior
- * to the cache being marked invalid.
- */
-extern unsigned long inv_user_asm(void __user *mem, unsigned long len);
-static inline unsigned long __must_check __inv_user(
-	void __user *mem, unsigned long len)
-{
-	int retval;
-
-	might_fault();
-	retval = inv_user_asm(mem, len);
-	mb_incoherent();
-	return retval;
-}
-static inline unsigned long __must_check inv_user(
-	void __user *mem, unsigned long len)
-{
-	if (access_ok(VERIFY_WRITE, mem, len))
-		return __inv_user(mem, len);
-	return len;
-}
-
-/**
  * finv_user: - Flush-inval a block of memory in user space from cache.
  * @mem:   Destination address, in user space.
  * @len:   Number of bytes to invalidate.
diff --git a/arch/tile/include/asm/unaligned.h b/arch/tile/include/asm/unaligned.h
index 37dfbe5..5a58a0d 100644
--- a/arch/tile/include/asm/unaligned.h
+++ b/arch/tile/include/asm/unaligned.h
@@ -15,11 +15,15 @@
 #ifndef _ASM_TILE_UNALIGNED_H
 #define _ASM_TILE_UNALIGNED_H
 
-#include <linux/unaligned/le_struct.h>
-#include <linux/unaligned/be_byteshift.h>
-#include <linux/unaligned/generic.h>
-#define get_unaligned	__get_unaligned_le
-#define put_unaligned	__put_unaligned_le
+/*
+ * We could implement faster get_unaligned_[be/le]64 using the ldna
+ * instruction on tilegx; however, we need to either copy all of the
+ * other generic functions to here (which is pretty ugly) or else
+ * modify both the generic code and other arch code to allow arch
+ * specific unaligned data access functions.  Given these functions
+ * are not often called, we'll stick with the generic version.
+ */
+#include <asm-generic/unaligned.h>
 
 /*
  * Is the kernel doing fixups of unaligned accesses?  If <0, no kernel
diff --git a/arch/tile/include/asm/vdso.h b/arch/tile/include/asm/vdso.h
new file mode 100644
index 0000000..9f6a78d
--- /dev/null
+++ b/arch/tile/include/asm/vdso.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __TILE_VDSO_H__
+#define __TILE_VDSO_H__
+
+#include <linux/types.h>
+
+/*
+ * Note about the vdso_data structure:
+ *
+ * NEVER USE IT IN USERSPACE CODE DIRECTLY. The layout of the
+ * structure is supposed to be known only to the functions in the vdso
+ * itself and may change without notice.
+ */
+
+struct vdso_data {
+	__u64 tz_update_count;  /* Timezone atomicity ctr             */
+	__u64 tb_update_count;  /* Timebase atomicity ctr             */
+	__u64 xtime_tod_stamp;  /* TOD clock for xtime                */
+	__u64 xtime_clock_sec;  /* Kernel time second                 */
+	__u64 xtime_clock_nsec; /* Kernel time nanosecond             */
+	__u64 wtom_clock_sec;   /* Wall to monotonic clock second     */
+	__u64 wtom_clock_nsec;  /* Wall to monotonic clock nanosecond */
+	__u32 mult;             /* Cycle to nanosecond multiplier     */
+	__u32 shift;            /* Cycle to nanosecond divisor (power of two) */
+	__u32 tz_minuteswest;   /* Minutes west of Greenwich          */
+	__u32 tz_dsttime;       /* Type of dst correction             */
+};
+
+extern struct vdso_data *vdso_data;
+
+/* __vdso_rt_sigreturn is defined with the addresses in the vdso page. */
+extern void __vdso_rt_sigreturn(void);
+
+extern int setup_vdso_pages(void);
+
+#endif /* __TILE_VDSO_H__ */
diff --git a/arch/tile/include/gxio/iorpc_trio.h b/arch/tile/include/gxio/iorpc_trio.h
index 58105c3..d95b96f 100644
--- a/arch/tile/include/gxio/iorpc_trio.h
+++ b/arch/tile/include/gxio/iorpc_trio.h
@@ -30,6 +30,7 @@
 
 #define GXIO_TRIO_OP_ALLOC_MEMORY_MAPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1404)
 
+#define GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES IORPC_OPCODE(IORPC_FORMAT_NONE, 0x140e)
 #define GXIO_TRIO_OP_ALLOC_PIO_REGIONS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1412)
 
 #define GXIO_TRIO_OP_INIT_PIO_REGION_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1414)
@@ -54,6 +55,10 @@ int gxio_trio_alloc_memory_maps(gxio_trio_context_t * context,
 				unsigned int flags);
 
 
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t * context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags);
+
 int gxio_trio_alloc_pio_regions(gxio_trio_context_t * context,
 				unsigned int count, unsigned int first,
 				unsigned int flags);
diff --git a/arch/tile/include/gxio/iorpc_uart.h b/arch/tile/include/gxio/iorpc_uart.h
new file mode 100644
index 0000000..55429d4
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_uart.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_UART_LINUX_RPC_H__
+#define __GXIO_UART_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_uart_intf.h>
+#include <gxio/uart.h>
+#include <gxio/kiorpc.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_UART_OP_CFG_INTERRUPT     IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1900)
+#define GXIO_UART_OP_GET_MMIO_BASE     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_UART_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_uart_cfg_interrupt(gxio_uart_context_t *context, int inter_x,
+			    int inter_y, int inter_ipi, int inter_event);
+
+int gxio_uart_get_mmio_base(gxio_uart_context_t *context, HV_PTE *base);
+
+int gxio_uart_check_mmio_offset(gxio_uart_context_t *context,
+				unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_UART_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/uart.h b/arch/tile/include/gxio/uart.h
new file mode 100644
index 0000000..438ee7e
--- /dev/null
+++ b/arch/tile/include/gxio/uart.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_UART_H_
+#define _GXIO_UART_H_
+
+#include "common.h"
+
+#include <hv/drv_uart_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * An API for manipulating the UART interface.
+ */
+
+/*
+ *
+ * The Rshim allows access to the processor's UART interface.
+ */
+
+/* A context object used to manage UART resources. */
+typedef struct {
+
+	/* File descriptor for calling up to the hypervisor. */
+	int fd;
+
+	/* The VA at which our MMIO registers are mapped. */
+	char *mmio_base;
+
+} gxio_uart_context_t;
+
+/* Request UART interrupts.
+ *
+ *  Request that interrupts be delivered to a tile when the UART's
+ *  Receive FIFO is written, or the Write FIFO is read.
+ *
+ * @param context Pointer to a properly initialized gxio_uart_context_t.
+ * @param bind_cpu_x X coordinate of CPU to which interrupt will be delivered.
+ * @param bind_cpu_y Y coordinate of CPU to which interrupt will be delivered.
+ * @param bind_interrupt IPI interrupt number.
+ * @param bind_event Sub-interrupt event bit number; a negative value can
+ *  disable the interrupt.
+ * @return Zero if all of the requested UART events were successfully
+ *  configured to interrupt.
+ */
+extern int gxio_uart_cfg_interrupt(gxio_uart_context_t *context,
+				   int bind_cpu_x,
+				   int bind_cpu_y,
+				   int bind_interrupt, int bind_event);
+
+/* Initialize a UART context.
+ *
+ *  A properly initialized context must be obtained before any of the other
+ *  gxio_uart routines may be used.
+ *
+ * @param context Pointer to a gxio_uart_context_t, which will be initialized
+ *  by this routine, if it succeeds.
+ * @param uart_index Index of the UART to use.
+ * @return Zero if the context was successfully initialized, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_uart_init(gxio_uart_context_t *context, int uart_index);
+
+/* Destroy a UART context.
+ *
+ *  Once destroyed, a context may not be used with any gxio_uart routines
+ *  other than gxio_uart_init().  After this routine returns, no further
+ *  interrupts requested on this context will be delivered.  The state and
+ *  configuration of the pins which had been attached to this context are
+ *  unchanged by this operation.
+ *
+ * @param context Pointer to a gxio_uart_context_t.
+ * @return Zero if the context was successfully destroyed, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_uart_destroy(gxio_uart_context_t *context);
+
+/* Write UART register.
+ * @param context Pointer to a gxio_uart_context_t.
+ * @param offset UART register offset.
+ * @param word Data to be written to the UART register.
+ */
+extern void gxio_uart_write(gxio_uart_context_t *context, uint64_t offset,
+			    uint64_t word);
+
+/* Read UART register.
+ * @param context Pointer to a gxio_uart_context_t.
+ * @param offset UART register offset.
+ * @return Data read from UART register.
+ */
+extern uint64_t gxio_uart_read(gxio_uart_context_t *context, uint64_t offset);
+
+#endif /* _GXIO_UART_H_ */
diff --git a/arch/tile/include/hv/drv_trio_intf.h b/arch/tile/include/hv/drv_trio_intf.h
index ef9f3f5..237e04d 100644
--- a/arch/tile/include/hv/drv_trio_intf.h
+++ b/arch/tile/include/hv/drv_trio_intf.h
@@ -64,8 +64,9 @@ struct pcie_port_property
    *  will not consider it an error if the link comes up as a x8 link. */
   uint8_t allow_x8: 1;
 
-  /** Reserved. */
-  uint8_t reserved: 1;
+  /** If true, this link is connected to a device which may or may not
+   *  be present. */
+  uint8_t removable: 1;
 
 };
 
@@ -167,6 +168,9 @@ pcie_stream_intr_config_sel_t;
 struct pcie_trio_ports_property
 {
   struct pcie_port_property ports[TILEGX_TRIO_PCIES];
+
+  /** Set if this TRIO belongs to a Gx72 device. */
+  uint8_t is_gx72;
 };
 
 /* Flags indicating traffic class. */
diff --git a/arch/tile/include/hv/drv_uart_intf.h b/arch/tile/include/hv/drv_uart_intf.h
new file mode 100644
index 0000000..f5379e2
--- /dev/null
+++ b/arch/tile/include/hv/drv_uart_intf.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the UART driver.
+ */
+
+#ifndef _SYS_HV_DRV_UART_INTF_H
+#define _SYS_HV_DRV_UART_INTF_H
+
+#include <arch/uart.h>
+
+/** Number of UART ports supported. */
+#define TILEGX_UART_NR        2
+
+/** The mmap file offset (PA) of the UART MMIO region. */
+#define HV_UART_MMIO_OFFSET   0
+
+/** The maximum size of the UARTs MMIO region (64K Bytes). */
+#define HV_UART_MMIO_SIZE     (1UL << 16)
+
+#endif /* _SYS_HV_DRV_UART_INTF_H */
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index 837dca5..dfcdeb6 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -318,8 +318,11 @@
 /** hv_set_pte_super_shift */
 #define HV_DISPATCH_SET_PTE_SUPER_SHIFT           57
 
+/** hv_console_set_ipi */
+#define HV_DISPATCH_CONSOLE_SET_IPI               63
+
 /** One more than the largest dispatch value */
-#define _HV_DISPATCH_END                          58
+#define _HV_DISPATCH_END                          64
 
 
 #ifndef __ASSEMBLER__
@@ -541,14 +544,24 @@ typedef enum {
   HV_CONFSTR_CPUMOD_REV      = 18,
 
   /** Human-readable CPU module description. */
-  HV_CONFSTR_CPUMOD_DESC     = 19
+  HV_CONFSTR_CPUMOD_DESC     = 19,
+
+  /** Per-tile hypervisor statistics.  When this identifier is specified,
+   *  the hv_confstr call takes two extra arguments.  The first is the
+   *  HV_XY_TO_LOTAR of the target tile's coordinates.  The second is
+   *  a flag word.  The only current flag is the lowest bit, which means
+   *  "zero out the stats instead of retrieving them"; in this case the
+   *  buffer and buffer length are ignored. */
+  HV_CONFSTR_HV_STATS        = 20
 
 } HV_ConfstrQuery;
 
 /** Query a configuration string from the hypervisor.
  *
  * @param query Identifier for the specific string to be retrieved
- *        (HV_CONFSTR_xxx).
+ *        (HV_CONFSTR_xxx).  Some strings may require or permit extra
+ *        arguments to be appended which select specific objects to be
+ *        described; see the string descriptions above.
  * @param buf Buffer in which to place the string.
  * @param len Length of the buffer.
  * @return If query is valid, then the length of the corresponding string,
@@ -556,21 +569,16 @@ typedef enum {
  *        was truncated.  If query is invalid, HV_EINVAL.  If the specified
  *        buffer is not writable by the client, HV_EFAULT.
  */
-int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len);
+int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len, ...);
 
 /** Tile coordinate */
 typedef struct
 {
-#ifndef __BIG_ENDIAN__
   /** X coordinate, relative to supervisor's top-left coordinate */
   int x;
 
   /** Y coordinate, relative to supervisor's top-left coordinate */
   int y;
-#else
-  int y;
-  int x;
-#endif
 } HV_Coord;
 
 
@@ -585,6 +593,30 @@ typedef struct
  */
 int hv_get_ipi_pte(HV_Coord tile, int pl, HV_PTE* pte);
 
+/** Configure the console interrupt.
+ *
+ * When the console client interrupt is enabled, the hypervisor will
+ * deliver the specified IPI to the client in the following situations:
+ *
+ * - The console has at least one character available for input.
+ *
+ * - The console can accept new characters for output, and the last call
+ *   to hv_console_write() did not write all of the characters requested
+ *   by the client.
+ *
+ * Note that in some system configurations, console interrupts will not
+ * be available; clients should be prepared for this routine to fail and
+ * to fall back to periodic console polling in that case.
+ *
+ * @param ipi Index of the IPI register which will receive the interrupt.
+ * @param event IPI event number for console interrupt. If less than 0,
+ *        disable the console IPI interrupt.
+ * @param coord Tile to be targeted for console interrupt.
+ * @return 0 on success; otherwise HV_EINVAL if a parameter is illegal, or
+ *         HV_ENOTSUP if console interrupts are not available.
+ */
+int hv_console_set_ipi(int ipi, int event, HV_Coord coord);
+
 #else /* !CHIP_HAS_IPI() */
 
 /** A set of interrupts. */
@@ -1092,13 +1124,8 @@ HV_VirtAddrRange hv_inquire_virtual(int idx);
 /** A range of ASID values. */
 typedef struct
 {
-#ifndef __BIG_ENDIAN__
   HV_ASID start;        /**< First ASID in the range. */
   unsigned int size;    /**< Number of ASIDs. Zero for an invalid range. */
-#else
-  unsigned int size;    /**< Number of ASIDs. Zero for an invalid range. */
-  HV_ASID start;        /**< First ASID in the range. */
-#endif
 } HV_ASIDRange;
 
 /** Returns information about a range of ASIDs.
@@ -1422,7 +1449,6 @@ typedef enum
 /** Message recipient. */
 typedef struct
 {
-#ifndef __BIG_ENDIAN__
   /** X coordinate, relative to supervisor's top-left coordinate */
   unsigned int x:11;
 
@@ -1431,11 +1457,6 @@ typedef struct
 
   /** Status of this recipient */
   HV_Recip_State state:10;
-#else //__BIG_ENDIAN__
-  HV_Recip_State state:10;
-  unsigned int y:11;
-  unsigned int x:11;
-#endif
 } HV_Recipient;
 
 /** Send a message to a set of recipients.
diff --git a/arch/tile/include/uapi/arch/Kbuild b/arch/tile/include/uapi/arch/Kbuild
index 4ebc34f..97dfbec 100644
--- a/arch/tile/include/uapi/arch/Kbuild
+++ b/arch/tile/include/uapi/arch/Kbuild
@@ -1,7 +1,6 @@
 # UAPI Header export list
 header-y += abi.h
 header-y += chip.h
-header-y += chip_tile64.h
 header-y += chip_tilegx.h
 header-y += chip_tilepro.h
 header-y += icache.h
diff --git a/arch/tile/include/uapi/arch/chip.h b/arch/tile/include/uapi/arch/chip.h
index 926d3db..4c91f90 100644
--- a/arch/tile/include/uapi/arch/chip.h
+++ b/arch/tile/include/uapi/arch/chip.h
@@ -12,9 +12,7 @@
  *   more details.
  */
 
-#if __tile_chip__ == 0
-#include <arch/chip_tile64.h>
-#elif __tile_chip__ == 1
+#if __tile_chip__ == 1
 #include <arch/chip_tilepro.h>
 #elif defined(__tilegx__)
 #include <arch/chip_tilegx.h>
diff --git a/arch/tile/include/uapi/arch/chip_tile64.h b/arch/tile/include/uapi/arch/chip_tile64.h
deleted file mode 100644
index 261aaba..0000000
--- a/arch/tile/include/uapi/arch/chip_tile64.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-/*
- * @file
- * Global header file.
- * This header file specifies defines for TILE64.
- */
-
-#ifndef __ARCH_CHIP_H__
-#define __ARCH_CHIP_H__
-
-/** Specify chip version.
- * When possible, prefer the CHIP_xxx symbols below for future-proofing.
- * This is intended for cross-compiling; native compilation should
- * use the predefined __tile_chip__ symbol.
- */
-#define TILE_CHIP 0
-
-/** Specify chip revision.
- * This provides for the case of a respin of a particular chip type;
- * the normal value for this symbol is "0".
- * This is intended for cross-compiling; native compilation should
- * use the predefined __tile_chip_rev__ symbol.
- */
-#define TILE_CHIP_REV 0
-
-/** The name of this architecture. */
-#define CHIP_ARCH_NAME "tile64"
-
-/** The ELF e_machine type for binaries for this chip. */
-#define CHIP_ELF_TYPE() EM_TILE64
-
-/** The alternate ELF e_machine type for binaries for this chip. */
-#define CHIP_COMPAT_ELF_TYPE() 0x2506
-
-/** What is the native word size of the machine? */
-#define CHIP_WORD_SIZE() 32
-
-/** How many bits of a virtual address are used. Extra bits must be
- * the sign extension of the low bits.
- */
-#define CHIP_VA_WIDTH() 32
-
-/** How many bits are in a physical address? */
-#define CHIP_PA_WIDTH() 36
-
-/** Size of the L2 cache, in bytes. */
-#define CHIP_L2_CACHE_SIZE() 65536
-
-/** Log size of an L2 cache line in bytes. */
-#define CHIP_L2_LOG_LINE_SIZE() 6
-
-/** Size of an L2 cache line, in bytes. */
-#define CHIP_L2_LINE_SIZE() (1 << CHIP_L2_LOG_LINE_SIZE())
-
-/** Associativity of the L2 cache. */
-#define CHIP_L2_ASSOC() 2
-
-/** Size of the L1 data cache, in bytes. */
-#define CHIP_L1D_CACHE_SIZE() 8192
-
-/** Log size of an L1 data cache line in bytes. */
-#define CHIP_L1D_LOG_LINE_SIZE() 4
-
-/** Size of an L1 data cache line, in bytes. */
-#define CHIP_L1D_LINE_SIZE() (1 << CHIP_L1D_LOG_LINE_SIZE())
-
-/** Associativity of the L1 data cache. */
-#define CHIP_L1D_ASSOC() 2
-
-/** Size of the L1 instruction cache, in bytes. */
-#define CHIP_L1I_CACHE_SIZE() 8192
-
-/** Log size of an L1 instruction cache line in bytes. */
-#define CHIP_L1I_LOG_LINE_SIZE() 6
-
-/** Size of an L1 instruction cache line, in bytes. */
-#define CHIP_L1I_LINE_SIZE() (1 << CHIP_L1I_LOG_LINE_SIZE())
-
-/** Associativity of the L1 instruction cache. */
-#define CHIP_L1I_ASSOC() 1
-
-/** Stride with which flush instructions must be issued. */
-#define CHIP_FLUSH_STRIDE() CHIP_L2_LINE_SIZE()
-
-/** Stride with which inv instructions must be issued. */
-#define CHIP_INV_STRIDE() CHIP_L1D_LINE_SIZE()
-
-/** Stride with which finv instructions must be issued. */
-#define CHIP_FINV_STRIDE() CHIP_L1D_LINE_SIZE()
-
-/** Can the local cache coherently cache data that is homed elsewhere? */
-#define CHIP_HAS_COHERENT_LOCAL_CACHE() 0
-
-/** How many simultaneous outstanding victims can the L2 cache have? */
-#define CHIP_MAX_OUTSTANDING_VICTIMS() 2
-
-/** Does the TLB support the NC and NOALLOC bits? */
-#define CHIP_HAS_NC_AND_NOALLOC_BITS() 0
-
-/** Does the chip support hash-for-home caching? */
-#define CHIP_HAS_CBOX_HOME_MAP() 0
-
-/** Number of entries in the chip's home map tables. */
-/* #define CHIP_CBOX_HOME_MAP_SIZE() -- does not apply to chip 0 */
-
-/** Do uncacheable requests miss in the cache regardless of whether
- * there is matching data? */
-#define CHIP_HAS_ENFORCED_UNCACHEABLE_REQUESTS() 0
-
-/** Does the mf instruction wait for victims? */
-#define CHIP_HAS_MF_WAITS_FOR_VICTIMS() 1
-
-/** Does the chip have an "inv" instruction that doesn't also flush? */
-#define CHIP_HAS_INV() 0
-
-/** Does the chip have a "wh64" instruction? */
-#define CHIP_HAS_WH64() 0
-
-/** Does this chip have a 'dword_align' instruction? */
-#define CHIP_HAS_DWORD_ALIGN() 0
-
-/** Number of performance counters. */
-#define CHIP_PERFORMANCE_COUNTERS() 2
-
-/** Does this chip have auxiliary performance counters? */
-#define CHIP_HAS_AUX_PERF_COUNTERS() 0
-
-/** Is the CBOX_MSR1 SPR supported? */
-#define CHIP_HAS_CBOX_MSR1() 0
-
-/** Is the TILE_RTF_HWM SPR supported? */
-#define CHIP_HAS_TILE_RTF_HWM() 0
-
-/** Is the TILE_WRITE_PENDING SPR supported? */
-#define CHIP_HAS_TILE_WRITE_PENDING() 0
-
-/** Is the PROC_STATUS SPR supported? */
-#define CHIP_HAS_PROC_STATUS_SPR() 0
-
-/** Is the DSTREAM_PF SPR supported? */
-#define CHIP_HAS_DSTREAM_PF() 0
-
-/** Log of the number of mshims we have. */
-#define CHIP_LOG_NUM_MSHIMS() 2
-
-/** Are the bases of the interrupt vector areas fixed? */
-#define CHIP_HAS_FIXED_INTVEC_BASE() 1
-
-/** Are the interrupt masks split up into 2 SPRs? */
-#define CHIP_HAS_SPLIT_INTR_MASK() 1
-
-/** Is the cycle count split up into 2 SPRs? */
-#define CHIP_HAS_SPLIT_CYCLE() 1
-
-/** Does the chip have a static network? */
-#define CHIP_HAS_SN() 1
-
-/** Does the chip have a static network processor? */
-#define CHIP_HAS_SN_PROC() 1
-
-/** Size of the L1 static network processor instruction cache, in bytes. */
-#define CHIP_L1SNI_CACHE_SIZE() 2048
-
-/** Does the chip have DMA support in each tile? */
-#define CHIP_HAS_TILE_DMA() 1
-
-/** Does the chip have the second revision of the directly accessible
- *  dynamic networks?  This encapsulates a number of characteristics,
- *  including the absence of the catch-all, the absence of inline message
- *  tags, the absence of support for network context-switching, and so on.
- */
-#define CHIP_HAS_REV1_XDN() 0
-
-/** Does the chip have cmpexch and similar (fetchadd, exch, etc.)? */
-#define CHIP_HAS_CMPEXCH() 0
-
-/** Does the chip have memory-mapped I/O support? */
-#define CHIP_HAS_MMIO() 0
-
-/** Does the chip have post-completion interrupts? */
-#define CHIP_HAS_POST_COMPLETION_INTERRUPTS() 0
-
-/** Does the chip have native single step support? */
-#define CHIP_HAS_SINGLE_STEP() 0
-
-#ifndef __OPEN_SOURCE__  /* features only relevant to hypervisor-level code */
-
-/** How many entries are present in the instruction TLB? */
-#define CHIP_ITLB_ENTRIES() 8
-
-/** How many entries are present in the data TLB? */
-#define CHIP_DTLB_ENTRIES() 16
-
-/** How many MAF entries does the XAUI shim have? */
-#define CHIP_XAUI_MAF_ENTRIES() 16
-
-/** Does the memory shim have a source-id table? */
-#define CHIP_HAS_MSHIM_SRCID_TABLE() 1
-
-/** Does the L1 instruction cache clear on reset? */
-#define CHIP_HAS_L1I_CLEAR_ON_RESET() 0
-
-/** Does the chip come out of reset with valid coordinates on all tiles?
- * Note that if defined, this also implies that the upper left is 1,1.
- */
-#define CHIP_HAS_VALID_TILE_COORD_RESET() 0
-
-/** Does the chip have unified packet formats? */
-#define CHIP_HAS_UNIFIED_PACKET_FORMATS() 0
-
-/** Does the chip support write reordering? */
-#define CHIP_HAS_WRITE_REORDERING() 0
-
-/** Does the chip support Y-X routing as well as X-Y? */
-#define CHIP_HAS_Y_X_ROUTING() 0
-
-/** Is INTCTRL_3 managed with the correct MPL? */
-#define CHIP_HAS_INTCTRL_3_STATUS_FIX() 0
-
-/** Is it possible to configure the chip to be big-endian? */
-#define CHIP_HAS_BIG_ENDIAN_CONFIG() 0
-
-/** Is the CACHE_RED_WAY_OVERRIDDEN SPR supported? */
-#define CHIP_HAS_CACHE_RED_WAY_OVERRIDDEN() 0
-
-/** Is the DIAG_TRACE_WAY SPR supported? */
-#define CHIP_HAS_DIAG_TRACE_WAY() 0
-
-/** Is the MEM_STRIPE_CONFIG SPR supported? */
-#define CHIP_HAS_MEM_STRIPE_CONFIG() 0
-
-/** Are the TLB_PERF SPRs supported? */
-#define CHIP_HAS_TLB_PERF() 0
-
-/** Is the VDN_SNOOP_SHIM_CTL SPR supported? */
-#define CHIP_HAS_VDN_SNOOP_SHIM_CTL() 0
-
-/** Does the chip support rev1 DMA packets? */
-#define CHIP_HAS_REV1_DMA_PACKETS() 0
-
-/** Does the chip have an IPI shim? */
-#define CHIP_HAS_IPI() 0
-
-#endif /* !__OPEN_SOURCE__ */
-#endif /* __ARCH_CHIP_H__ */
diff --git a/arch/tile/include/uapi/arch/opcode_tilegx.h b/arch/tile/include/uapi/arch/opcode_tilegx.h
index c14d02c..d76ff2d 100644
--- a/arch/tile/include/uapi/arch/opcode_tilegx.h
+++ b/arch/tile/include/uapi/arch/opcode_tilegx.h
@@ -61,6 +61,7 @@ typedef tilegx_bundle_bits tile_bundle_bits;
 #define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEGX_BUNDLE_ALIGNMENT_IN_BYTES
 #define TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES \
   TILEGX_LOG2_BUNDLE_ALIGNMENT_IN_BYTES
+#define TILE_BPT_BUNDLE TILEGX_BPT_BUNDLE
 
 /* 64-bit pattern for a { bpt ; nop } bundle. */
 #define TILEGX_BPT_BUNDLE 0x286a44ae51485000ULL
diff --git a/arch/tile/include/uapi/arch/opcode_tilepro.h b/arch/tile/include/uapi/arch/opcode_tilepro.h
index 71b763b..4451cff 100644
--- a/arch/tile/include/uapi/arch/opcode_tilepro.h
+++ b/arch/tile/include/uapi/arch/opcode_tilepro.h
@@ -71,6 +71,7 @@ typedef tilepro_bundle_bits tile_bundle_bits;
 #define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEPRO_BUNDLE_ALIGNMENT_IN_BYTES
 #define TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES \
   TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES
+#define TILE_BPT_BUNDLE TILEPRO_BPT_BUNDLE
 
 /* 64-bit pattern for a { bpt ; nop } bundle. */
 #define TILEPRO_BPT_BUNDLE 0x400b3cae70166000ULL
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446..78daa31 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -200,8 +200,6 @@
 #define SPR_SIM_CONTROL 0x4e0c
 #define SPR_SNCTL 0x0805
 #define SPR_SNCTL__FRZFABRIC_MASK  0x1
-#define SPR_SNCTL__FRZPROC_MASK  0x2
-#define SPR_SNPC 0x080b
 #define SPR_SNSTATIC 0x080c
 #define SPR_SYSTEM_SAVE_0_0 0x4b00
 #define SPR_SYSTEM_SAVE_0_1 0x4b01
diff --git a/arch/tile/include/uapi/asm/auxvec.h b/arch/tile/include/uapi/asm/auxvec.h
index 1d393ed..c93e927 100644
--- a/arch/tile/include/uapi/asm/auxvec.h
+++ b/arch/tile/include/uapi/asm/auxvec.h
@@ -15,6 +15,7 @@
 #ifndef _ASM_TILE_AUXVEC_H
 #define _ASM_TILE_AUXVEC_H
 
-/* No extensions to auxvec */
+/* The vDSO location. */
+#define AT_SYSINFO_EHDR         33
 
 #endif /* _ASM_TILE_AUXVEC_H */
diff --git a/arch/tile/include/uapi/asm/cachectl.h b/arch/tile/include/uapi/asm/cachectl.h
index af4c9f9..572ddca 100644
--- a/arch/tile/include/uapi/asm/cachectl.h
+++ b/arch/tile/include/uapi/asm/cachectl.h
@@ -29,8 +29,8 @@
  * to honor the arguments at some point.)
  *
  * Flush and invalidation of memory can normally be performed with the
- * __insn_flush(), __insn_inv(), and __insn_finv() instructions from
- * userspace.  The DCACHE option to the system call allows userspace
+ * __insn_flush() and __insn_finv() instructions from userspace.
+ * The DCACHE option to the system call allows userspace
  * to flush the entire L1+L2 data cache from the core.  In this case,
  * the address and length arguments are not used.  The DCACHE flush is
  * restricted to the current core, not all cores in the address space.
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index 5334be8..27a2bf3 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -3,11 +3,17 @@
 #
 
 extra-y := vmlinux.lds head_$(BITS).o
-obj-y := backtrace.o entry.o irq.o messaging.o \
+obj-y := backtrace.o entry.o hvglue.o irq.o messaging.o \
 	pci-dma.o proc.o process.o ptrace.o reboot.o \
-	setup.o signal.o single_step.o stack.o sys.o sysfs.o time.o traps.o \
+	setup.o signal.o single_step.o stack.o sys.o \
+	sysfs.o time.o traps.o unaligned.o vdso.o \
 	intvec_$(BITS).o regs_$(BITS).o tile-desc_$(BITS).o
 
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+endif
+
 obj-$(CONFIG_HARDWALL)		+= hardwall.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_signal.o
 obj-$(CONFIG_SMP)		+= smpboot.o smp.o tlb.o
@@ -20,3 +26,9 @@ else
 obj-$(CONFIG_PCI)		+= pci.o
 endif
 obj-$(CONFIG_TILE_USB)		+= usb.o
+obj-$(CONFIG_TILE_HVGLUE_TRACE)	+= hvglue_trace.o
+obj-$(CONFIG_FUNCTION_TRACER)	+= ftrace.o mcount_64.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_KGDB)		+= kgdb.o
+
+obj-y				+= vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 01ddf19..375e7c3 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -14,13 +14,6 @@
  * Generates definitions from c-type structures used by assembly sources.
  */
 
-#include <linux/kbuild.h>
-#include <linux/thread_info.h>
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-#include <linux/ptrace.h>
-#include <hv/hypervisor.h>
-
 /* Check for compatible compiler early in the build. */
 #ifdef CONFIG_TILEGX
 # ifndef __tilegx__
@@ -31,46 +24,61 @@
 # endif
 #else
 # ifdef __tilegx__
-#  error Can not build TILEPro/TILE64 configurations with tilegx compiler
+#  error Can not build TILEPro configurations with tilegx compiler
 # endif
 #endif
 
+#include <linux/kbuild.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/ptrace.h>
+#include <hv/hypervisor.h>
+
 void foo(void)
 {
-	DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET,
 	       offsetof(struct single_step_state, buffer));
-	DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET,
 	       offsetof(struct single_step_state, flags));
-	DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET,
 	       offsetof(struct single_step_state, orig_pc));
-	DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET,
 	       offsetof(struct single_step_state, next_pc));
-	DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET,
 	       offsetof(struct single_step_state, branch_next_pc));
-	DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET,
 	       offsetof(struct single_step_state, update_value));
 
-	DEFINE(THREAD_INFO_TASK_OFFSET, \
+	DEFINE(THREAD_INFO_TASK_OFFSET,
 	       offsetof(struct thread_info, task));
-	DEFINE(THREAD_INFO_FLAGS_OFFSET, \
+	DEFINE(THREAD_INFO_FLAGS_OFFSET,
 	       offsetof(struct thread_info, flags));
-	DEFINE(THREAD_INFO_STATUS_OFFSET, \
+	DEFINE(THREAD_INFO_STATUS_OFFSET,
 	       offsetof(struct thread_info, status));
-	DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET, \
+	DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET,
 	       offsetof(struct thread_info, homecache_cpu));
-	DEFINE(THREAD_INFO_STEP_STATE_OFFSET, \
+	DEFINE(THREAD_INFO_PREEMPT_COUNT_OFFSET,
+	       offsetof(struct thread_info, preempt_count));
+	DEFINE(THREAD_INFO_STEP_STATE_OFFSET,
 	       offsetof(struct thread_info, step_state));
+#ifdef __tilegx__
+	DEFINE(THREAD_INFO_UNALIGN_JIT_BASE_OFFSET,
+	       offsetof(struct thread_info, unalign_jit_base));
+	DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
+	       offsetof(struct thread_info, unalign_jit_tmp));
+#endif
 
 	DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
 	       offsetof(struct task_struct, thread.ksp));
 	DEFINE(TASK_STRUCT_THREAD_PC_OFFSET,
 	       offsetof(struct task_struct, thread.pc));
 
-	DEFINE(HV_TOPOLOGY_WIDTH_OFFSET, \
+	DEFINE(HV_TOPOLOGY_WIDTH_OFFSET,
 	       offsetof(HV_Topology, width));
-	DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET, \
+	DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET,
 	       offsetof(HV_Topology, height));
 
-	DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET, \
+	DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET,
 	       offsetof(irq_cpustat_t, irq_syscall_count));
 }
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index d0a052e..85e00b2 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -32,6 +32,7 @@
 #include <asm/ucontext.h>
 #include <asm/sigframe.h>
 #include <asm/syscalls.h>
+#include <asm/vdso.h>
 #include <arch/interrupts.h>
 
 struct compat_ucontext {
@@ -227,7 +228,7 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (err)
 		goto give_sigsegv;
 
-	restorer = VDSO_BASE;
+	restorer = VDSO_SYM(&__vdso_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ptr_to_compat_reg(ka->sa.sa_restorer);
 
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index 34d72a1..b608e00 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -23,19 +23,24 @@
 
 static void early_hv_write(struct console *con, const char *s, unsigned n)
 {
-	hv_console_write((HV_VirtAddr) s, n);
+	tile_console_write(s, n);
+
+	/*
+	 * Convert NL to NLCR (close enough to CRNL) during early boot.
+	 * We assume newlines are at the ends of strings, which turns out
+	 * to be good enough for early boot console output.
+	 */
+	if (n && s[n-1] == '\n')
+		tile_console_write("\r", 1);
 }
 
 static struct console early_hv_console = {
 	.name =		"earlyhv",
 	.write =	early_hv_write,
-	.flags =	CON_PRINTBUFFER,
+	.flags =	CON_PRINTBUFFER | CON_BOOT,
 	.index =	-1,
 };
 
-/* Direct interface for emergencies */
-static int early_console_complete;
-
 void early_panic(const char *fmt, ...)
 {
 	va_list ap;
@@ -43,51 +48,21 @@ void early_panic(const char *fmt, ...)
 	va_start(ap, fmt);
 	early_printk("Kernel panic - not syncing: ");
 	early_vprintk(fmt, ap);
-	early_console->write(early_console, "\n", 1);
+	early_printk("\n");
 	va_end(ap);
 	dump_stack();
 	hv_halt();
 }
 
-static int __initdata keep_early;
-
 static int __init setup_early_printk(char *str)
 {
 	if (early_console)
 		return 1;
 
-	if (str != NULL && strncmp(str, "keep", 4) == 0)
-		keep_early = 1;
-
 	early_console = &early_hv_console;
 	register_console(early_console);
 
 	return 0;
 }
 
-void __init disable_early_printk(void)
-{
-	early_console_complete = 1;
-	if (!early_console)
-		return;
-	if (!keep_early) {
-		early_printk("disabling early console\n");
-		unregister_console(early_console);
-		early_console = NULL;
-	} else {
-		early_printk("keeping early console\n");
-	}
-}
-
-void warn_early_printk(void)
-{
-	if (early_console_complete || early_console)
-		return;
-	early_printk("\
-Machine shutting down before console output is fully initialized.\n\
-You may wish to reboot and add the option 'earlyprintk' to your\n\
-boot command line to see any diagnostic early console output.\n\
-");
-}
-
 early_param("earlyprintk", setup_early_printk);
diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S
index f116cb0..3d91759 100644
--- a/arch/tile/kernel/entry.S
+++ b/arch/tile/kernel/entry.S
@@ -27,22 +27,6 @@ STD_ENTRY(current_text_addr)
 	{ move r0, lr; jrp lr }
 	STD_ENDPROC(current_text_addr)
 
-/*
- * We don't run this function directly, but instead copy it to a page
- * we map into every user process.  See vdso_setup().
- *
- * Note that libc has a copy of this function that it uses to compare
- * against the PC when a stack backtrace ends, so if this code is
- * changed, the libc implementation(s) should also be updated.
- */
-	.pushsection .data
-ENTRY(__rt_sigreturn)
-	moveli TREG_SYSCALL_NR_NAME,__NR_rt_sigreturn
-	swint1
-	ENDPROC(__rt_sigreturn)
-	ENTRY(__rt_sigreturn_end)
-	.popsection
-
 STD_ENTRY(dump_stack)
 	{ move r2, lr; lnk r1 }
 	{ move r4, r52; addli r1, r1, dump_stack - . }
diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c
new file mode 100644
index 0000000..f1c4520
--- /dev/null
+++ b/arch/tile/kernel/ftrace.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx specific ftrace support
+ */
+
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+#include <asm/sections.h>
+
+#include <arch/opcode.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static inline tilegx_bundle_bits NOP(void)
+{
+	return create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) |
+		create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) |
+		create_Opcode_X0(RRR_0_OPCODE_X0) |
+		create_UnaryOpcodeExtension_X1(NOP_UNARY_OPCODE_X1) |
+		create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) |
+		create_Opcode_X1(RRR_0_OPCODE_X1);
+}
+
+static int machine_stopped __read_mostly;
+
+int ftrace_arch_code_modify_prepare(void)
+{
+	machine_stopped = 1;
+	return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+{
+	flush_icache_range(0, CHIP_L1I_CACHE_SIZE());
+	machine_stopped = 0;
+	return 0;
+}
+
+/*
+ * Put { move r10, lr; jal ftrace_caller } in one bundle; this lets the dynamic
+ * tracer add just one cycle of overhead to every kernel function when disabled.
+ */
+static unsigned long ftrace_gen_branch(unsigned long pc, unsigned long addr,
+				       bool link)
+{
+	tilegx_bundle_bits opcode_x0, opcode_x1;
+	long pcrel_by_instr = (addr - pc) >> TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES;
+
+	if (link) {
+		/* opcode: jal addr */
+		opcode_x1 =
+			create_Opcode_X1(JUMP_OPCODE_X1) |
+			create_JumpOpcodeExtension_X1(JAL_JUMP_OPCODE_X1) |
+			create_JumpOff_X1(pcrel_by_instr);
+	} else {
+		/* opcode: j addr */
+		opcode_x1 =
+			create_Opcode_X1(JUMP_OPCODE_X1) |
+			create_JumpOpcodeExtension_X1(J_JUMP_OPCODE_X1) |
+			create_JumpOff_X1(pcrel_by_instr);
+	}
+
+	if (addr == FTRACE_ADDR) {
+		/* opcode: or r10, lr, zero */
+		opcode_x0 =
+			create_Dest_X0(10) |
+			create_SrcA_X0(TREG_LR) |
+			create_SrcB_X0(TREG_ZERO) |
+			create_RRROpcodeExtension_X0(OR_RRR_0_OPCODE_X0) |
+			create_Opcode_X0(RRR_0_OPCODE_X0);
+	} else {
+		/* opcode: fnop */
+		opcode_x0 =
+			create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) |
+			create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) |
+			create_Opcode_X0(RRR_0_OPCODE_X0);
+	}
+
+	return opcode_x1 | opcode_x0;
+}
+
+static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
+{
+	return NOP();
+}
+
+static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr)
+{
+	return ftrace_gen_branch(pc, addr, true);
+}
+
+static int ftrace_modify_code(unsigned long pc, unsigned long old,
+			      unsigned long new)
+{
+	unsigned long pc_wr;
+
+	/* Check if the address is in kernel text space or module space. */
+	if (!kernel_text_address(pc))
+		return -EINVAL;
+
+	/* Operate on the writable kernel text mapping. */
+	pc_wr = pc - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)pc_wr, &new, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+	smp_wmb();
+
+	if (!machine_stopped && num_online_cpus() > 1)
+		flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+
+	return 0;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long pc, old;
+	unsigned long new;
+	int ret;
+
+	pc = (unsigned long)&ftrace_call;
+	memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(pc, (unsigned long)func);
+
+	ret = ftrace_modify_code(pc, old, new);
+
+	return ret;
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long new, old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_nop_replace(rec);
+	new = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned long old;
+	unsigned long new;
+	int ret;
+
+	old = ftrace_call_replace(ip, addr);
+	new = ftrace_nop_replace(rec);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	*(unsigned long *)data = 0;
+
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+			   unsigned long frame_pointer)
+{
+	unsigned long return_hooker = (unsigned long) &return_to_handler;
+	struct ftrace_graph_ent trace;
+	unsigned long old;
+	int err;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	old = *parent;
+	*parent = return_hooker;
+
+	err = ftrace_push_return_trace(old, self_addr, &trace.depth,
+				       frame_pointer);
+	if (err == -EBUSY) {
+		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_graph_call;
+
+static int __ftrace_modify_caller(unsigned long *callsite,
+				  void (*func) (void), bool enable)
+{
+	unsigned long caller_fn = (unsigned long) func;
+	unsigned long pc = (unsigned long) callsite;
+	unsigned long branch = ftrace_gen_branch(pc, caller_fn, false);
+	unsigned long nop = NOP();
+	unsigned long old = enable ? nop : branch;
+	unsigned long new = enable ? branch : nop;
+
+	return ftrace_modify_code(pc, old, new);
+}
+
+static int ftrace_modify_graph_caller(bool enable)
+{
+	int ret;
+
+	ret = __ftrace_modify_caller(&ftrace_graph_call,
+				     ftrace_graph_caller,
+				     enable);
+
+	return ret;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	return ftrace_modify_graph_caller(true);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	return ftrace_modify_graph_caller(false);
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
index 38ac189..df27a1f 100644
--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
@@ -272,9 +272,9 @@ static void hardwall_setup_func(void *info)
 	struct hardwall_info *r = info;
 	struct hardwall_type *hwt = r->type;
 
-	int cpu = smp_processor_id();
-	int x = cpu % smp_width;
-	int y = cpu / smp_width;
+	int cpu = smp_processor_id();  /* on_each_cpu disables preemption */
+	int x = cpu_x(cpu);
+	int y = cpu_y(cpu);
 	int bits = 0;
 	if (x == r->ulhc_x)
 		bits |= W_PROTECT;
@@ -317,6 +317,7 @@ static void hardwall_protect_rectangle(struct hardwall_info *r)
 	on_each_cpu_mask(&rect_cpus, hardwall_setup_func, r, 1);
 }
 
+/* Entered from INT_xDN_FIREWALL interrupt vector with irqs disabled. */
 void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 {
 	struct hardwall_info *rect;
@@ -325,7 +326,6 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 	struct siginfo info;
 	int cpu = smp_processor_id();
 	int found_processes;
-	unsigned long flags;
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
 	irq_enter();
@@ -346,7 +346,7 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 	BUG_ON(hwt->disabled);
 
 	/* This tile trapped a network access; find the rectangle. */
-	spin_lock_irqsave(&hwt->lock, flags);
+	spin_lock(&hwt->lock);
 	list_for_each_entry(rect, &hwt->list, list) {
 		if (cpumask_test_cpu(cpu, &rect->cpumask))
 			break;
@@ -401,7 +401,7 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 		pr_notice("hardwall: no associated processes!\n");
 
  done:
-	spin_unlock_irqrestore(&hwt->lock, flags);
+	spin_unlock(&hwt->lock);
 
 	/*
 	 * We have to disable firewall interrupts now, or else when we
@@ -540,6 +540,14 @@ static struct hardwall_info *hardwall_create(struct hardwall_type *hwt,
 		}
 	}
 
+	/*
+	 * Eliminate cpus that are not part of this Linux client.
+	 * Note that this allows for configurations that we might not want to
+	 * support, such as one client on every even cpu, another client on
+	 * every odd cpu.
+	 */
+	cpumask_and(&info->cpumask, &info->cpumask, cpu_online_mask);
+
 	/* Confirm it doesn't overlap and add it to the list. */
 	spin_lock_irqsave(&hwt->lock, flags);
 	list_for_each_entry(iter, &hwt->list, list) {
@@ -612,7 +620,7 @@ static int hardwall_activate(struct hardwall_info *info)
 
 /*
  * Deactivate a task's hardwall.  Must hold lock for hardwall_type.
- * This method may be called from free_task(), so we don't want to
+ * This method may be called from exit_thread(), so we don't want to
  * rely on too many fields of struct task_struct still being valid.
  * We assume the cpus_allowed, pid, and comm fields are still valid.
  */
@@ -653,7 +661,7 @@ static int hardwall_deactivate(struct hardwall_type *hwt,
 		return -EINVAL;
 
 	printk(KERN_DEBUG "Pid %d (%s) deactivated for %s hardwall: cpu %d\n",
-	       task->pid, task->comm, hwt->name, smp_processor_id());
+	       task->pid, task->comm, hwt->name, raw_smp_processor_id());
 	return 0;
 }
 
@@ -795,8 +803,8 @@ static void reset_xdn_network_state(struct hardwall_type *hwt)
 	/* Reset UDN coordinates to their standard value */
 	{
 		unsigned int cpu = smp_processor_id();
-		unsigned int x = cpu % smp_width;
-		unsigned int y = cpu / smp_width;
+		unsigned int x = cpu_x(cpu);
+		unsigned int y = cpu_y(cpu);
 		__insn_mtspr(SPR_UDN_TILE_COORD, (x << 18) | (y << 7));
 	}
 
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index ac11530..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -39,12 +39,12 @@ ENTRY(_start)
 	}
 	{
 	  moveli r0, _HV_VERSION_OLD_HV_INIT
-	  jal hv_init
+	  jal _hv_init
 	}
 	/* Get a reasonable default ASID in r0 */
 	{
 	  move r0, zero
-	  jal hv_inquire_asid
+	  jal _hv_inquire_asid
 	}
 	/* Install the default page table */
 	{
@@ -64,7 +64,7 @@ ENTRY(_start)
 	  auli r0, r0, ha16(swapper_pg_dir - PAGE_OFFSET)
 	}
 	{
-	  inv r6
+	  finv r6
 	  move r1, zero   /* high 32 bits of CPA is zero */
 	}
 	{
@@ -73,12 +73,12 @@ ENTRY(_start)
 	}
 	{
 	  auli lr, lr, ha16(1f)
-	  j hv_install_context
+	  j _hv_install_context
 	}
 1:
 
 	/* Get our processor number and save it away in SAVE_K_0. */
-	jal hv_inquire_topology
+	jal _hv_inquire_topology
 	mulll_uu r4, r1, r2        /* r1 == y, r2 == width */
 	add r4, r4, r0             /* r0 == x, so r4 == cpu == y*width + x */
 
@@ -86,7 +86,7 @@ ENTRY(_start)
 	/*
 	 * Load up our per-cpu offset.  When the first (master) tile
 	 * boots, this value is still zero, so we will load boot_pc
-	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * with start_kernel, and boot_sp at the top of init_stack.
 	 * The master tile initializes the per-cpu offset array, so that
 	 * when subsequent (secondary) tiles boot, they will instead load
 	 * from their per-cpu versions of boot_sp and boot_pc.
@@ -126,7 +126,6 @@ ENTRY(_start)
 	lw sp, r1
 	or r4, sp, r4
 	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
-	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
 	  jr r0
@@ -163,8 +162,8 @@ ENTRY(swapper_pg_dir)
 	.set addr, addr + PGDIR_SIZE
 	.endr
 
-	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
-	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+	/* The true text VAs are mapped as VA = PA + MEM_SV_START */
+	PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
 			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
 	.org swapper_pg_dir + PGDIR_SIZE
 	END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 6093964..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -25,6 +25,15 @@
 #include <arch/chip.h>
 #include <arch/spr_def.h>
 
+/* Extract two 32-bit values that were read into one register. */
+#ifdef __BIG_ENDIAN__
+#define GET_FIRST_INT(rd, rs) shrsi rd, rs, 32
+#define GET_SECOND_INT(rd, rs) addxi rd, rs, 0
+#else
+#define GET_FIRST_INT(rd, rs) addxi rd, rs, 0
+#define GET_SECOND_INT(rd, rs) shrsi rd, rs, 32
+#endif
+
 /*
  * This module contains the entry code for kernel images. It performs the
  * minimal setup needed to call the generic C routines.
@@ -46,11 +55,11 @@ ENTRY(_start)
 	  movei r2, TILE_CHIP_REV
 	  movei r3, KERNEL_PL
 	}
-	jal hv_init
+	jal _hv_init
 	/* Get a reasonable default ASID in r0 */
 	{
 	  move r0, zero
-	  jal hv_inquire_asid
+	  jal _hv_inquire_asid
 	}
 
 	/*
@@ -61,7 +70,7 @@ ENTRY(_start)
 	 * other CPUs should see a properly-constructed page table.
 	 */
 	{
-	  v4int_l r2, zero, r0    /* ASID for hv_install_context */
+	  GET_FIRST_INT(r2, r0)    /* ASID for hv_install_context */
 	  moveli r4, hw1_last(swapper_pgprot - PAGE_OFFSET)
 	}
 	{
@@ -77,7 +86,7 @@ ENTRY(_start)
 	{
 	  /* After initializing swapper_pgprot, HV_PTE_GLOBAL is set. */
 	  bfextu r7, r1, HV_PTE_INDEX_GLOBAL, HV_PTE_INDEX_GLOBAL
-	  inv r4
+	  finv r4
 	}
 	bnez r7, .Lno_write
 	{
@@ -121,29 +130,24 @@ ENTRY(_start)
 	}
 	{
 	  moveli r3, CTX_PAGE_FLAG
-	  j hv_install_context
+	  j _hv_install_context
 	}
 1:
 
 	/* Install the interrupt base. */
-	moveli r0, hw2_last(MEM_SV_START)
-	shl16insli r0, r0, hw1(MEM_SV_START)
-	shl16insli r0, r0, hw0(MEM_SV_START)
+	moveli r0, hw2_last(intrpt_start)
+	shl16insli r0, r0, hw1(intrpt_start)
+	shl16insli r0, r0, hw0(intrpt_start)
 	mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
 
-	/*
-	 * Get our processor number and save it away in SAVE_K_0.
-	 * Extract stuff from the topology structure: r4 = y, r6 = x,
-	 * r5 = width.  FIXME: consider whether we want to just make these
-	 * 64-bit values (and if so fix smp_topology write below, too).
-	 */
-	jal hv_inquire_topology
+	/* Get our processor number and save it away in SAVE_K_0. */
+	jal _hv_inquire_topology
 	{
-	  v4int_l r5, zero, r1    /* r5 = width */
-	  shrui r4, r0, 32        /* r4 = y */
+	  GET_FIRST_INT(r5, r1)   /* r5 = width */
+	  GET_SECOND_INT(r4, r0)  /* r4 = y */
 	}
 	{
-	  v4int_l r6, zero, r0    /* r6 = x */
+	  GET_FIRST_INT(r6, r0)   /* r6 = x */
 	  mul_lu_lu r4, r4, r5
 	}
 	{
@@ -154,7 +158,7 @@ ENTRY(_start)
 	/*
 	 * Load up our per-cpu offset.  When the first (master) tile
 	 * boots, this value is still zero, so we will load boot_pc
-	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * with start_kernel, and boot_sp at the top of init_stack.
 	 * The master tile initializes the per-cpu offset array, so that
 	 * when subsequent (secondary) tiles boot, they will instead load
 	 * from their per-cpu versions of boot_sp and boot_pc.
@@ -198,9 +202,9 @@ ENTRY(_start)
 	}
 	ld r0, r0
 	ld sp, r1
-	or r4, sp, r4
+	shli r4, r4, CPU_SHIFT
+	bfins r4, sp, 0, CPU_SHIFT-1
 	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
-	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
 	  jr r0
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
new file mode 100644
index 0000000..2ab4566
--- /dev/null
+++ b/arch/tile/kernel/hvglue.S
@@ -0,0 +1,74 @@
+/* Hypervisor call vector addresses; see <hv/hypervisor.h> */
+.macro gensym sym, val, size	/* define _\sym at offset \val */
+.org \val			/* move to the fixed offset of this entry */
+.global _\sym
+.type _\sym,function
+_\sym:
+.size _\sym,\size
+#ifndef CONFIG_TILE_HVGLUE_TRACE
+.globl \sym			/* not tracing: also export the plain name */
+.set \sym,_\sym			/* as an alias of the underscore symbol */
+#endif
+.endm
+
+.section .hvglue,"x",@nobits
+.align 8
+gensym hv_init, 0x20, 32
+gensym hv_install_context, 0x40, 32
+gensym hv_sysconf, 0x60, 32
+gensym hv_get_rtc, 0x80, 32
+gensym hv_set_rtc, 0xa0, 32
+gensym hv_flush_asid, 0xc0, 32
+gensym hv_flush_page, 0xe0, 32
+gensym hv_flush_pages, 0x100, 32
+gensym hv_restart, 0x120, 32
+gensym hv_halt, 0x140, 32
+gensym hv_power_off, 0x160, 32
+gensym hv_inquire_physical, 0x180, 32
+gensym hv_inquire_memory_controller, 0x1a0, 32
+gensym hv_inquire_virtual, 0x1c0, 32
+gensym hv_inquire_asid, 0x1e0, 32
+gensym hv_nanosleep, 0x200, 32
+gensym hv_console_read_if_ready, 0x220, 32
+gensym hv_console_write, 0x240, 32
+gensym hv_downcall_dispatch, 0x260, 32
+gensym hv_inquire_topology, 0x280, 32
+gensym hv_fs_findfile, 0x2a0, 32
+gensym hv_fs_fstat, 0x2c0, 32
+gensym hv_fs_pread, 0x2e0, 32
+gensym hv_physaddr_read64, 0x300, 32
+gensym hv_physaddr_write64, 0x320, 32
+gensym hv_get_command_line, 0x340, 32
+gensym hv_set_caching, 0x360, 32
+gensym hv_bzero_page, 0x380, 32
+gensym hv_register_message_state, 0x3a0, 32
+gensym hv_send_message, 0x3c0, 32
+gensym hv_receive_message, 0x3e0, 32
+gensym hv_inquire_context, 0x400, 32
+gensym hv_start_all_tiles, 0x420, 32
+gensym hv_dev_open, 0x440, 32
+gensym hv_dev_close, 0x460, 32
+gensym hv_dev_pread, 0x480, 32
+gensym hv_dev_pwrite, 0x4a0, 32
+gensym hv_dev_poll, 0x4c0, 32
+gensym hv_dev_poll_cancel, 0x4e0, 32
+gensym hv_dev_preada, 0x500, 32
+gensym hv_dev_pwritea, 0x520, 32
+gensym hv_flush_remote, 0x540, 32
+gensym hv_console_putc, 0x560, 32
+gensym hv_inquire_tiles, 0x580, 32
+gensym hv_confstr, 0x5a0, 32
+gensym hv_reexec, 0x5c0, 32
+gensym hv_set_command_line, 0x5e0, 32
+gensym hv_clear_intr, 0x600, 32
+gensym hv_enable_intr, 0x620, 32
+gensym hv_disable_intr, 0x640, 32
+gensym hv_raise_intr, 0x660, 32
+gensym hv_trigger_ipi, 0x680, 32
+gensym hv_store_mapping, 0x6a0, 32
+gensym hv_inquire_realpa, 0x6c0, 32
+gensym hv_flush_all, 0x6e0, 32
+gensym hv_get_ipi_pte, 0x700, 32
+gensym hv_set_pte_super_shift, 0x720, 32
+gensym hv_console_set_ipi, 0x7e0, 32
+gensym hv_glue_internals, 0x800, 30720
diff --git a/arch/tile/kernel/hvglue.lds b/arch/tile/kernel/hvglue.lds
deleted file mode 100644
index d44c5a6..0000000
--- a/arch/tile/kernel/hvglue.lds
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Hypervisor call vector addresses; see <hv/hypervisor.h> */
-hv_init = TEXT_OFFSET + 0x10020;
-hv_install_context = TEXT_OFFSET + 0x10040;
-hv_sysconf = TEXT_OFFSET + 0x10060;
-hv_get_rtc = TEXT_OFFSET + 0x10080;
-hv_set_rtc = TEXT_OFFSET + 0x100a0;
-hv_flush_asid = TEXT_OFFSET + 0x100c0;
-hv_flush_page = TEXT_OFFSET + 0x100e0;
-hv_flush_pages = TEXT_OFFSET + 0x10100;
-hv_restart = TEXT_OFFSET + 0x10120;
-hv_halt = TEXT_OFFSET + 0x10140;
-hv_power_off = TEXT_OFFSET + 0x10160;
-hv_inquire_physical = TEXT_OFFSET + 0x10180;
-hv_inquire_memory_controller = TEXT_OFFSET + 0x101a0;
-hv_inquire_virtual = TEXT_OFFSET + 0x101c0;
-hv_inquire_asid = TEXT_OFFSET + 0x101e0;
-hv_nanosleep = TEXT_OFFSET + 0x10200;
-hv_console_read_if_ready = TEXT_OFFSET + 0x10220;
-hv_console_write = TEXT_OFFSET + 0x10240;
-hv_downcall_dispatch = TEXT_OFFSET + 0x10260;
-hv_inquire_topology = TEXT_OFFSET + 0x10280;
-hv_fs_findfile = TEXT_OFFSET + 0x102a0;
-hv_fs_fstat = TEXT_OFFSET + 0x102c0;
-hv_fs_pread = TEXT_OFFSET + 0x102e0;
-hv_physaddr_read64 = TEXT_OFFSET + 0x10300;
-hv_physaddr_write64 = TEXT_OFFSET + 0x10320;
-hv_get_command_line = TEXT_OFFSET + 0x10340;
-hv_set_caching = TEXT_OFFSET + 0x10360;
-hv_bzero_page = TEXT_OFFSET + 0x10380;
-hv_register_message_state = TEXT_OFFSET + 0x103a0;
-hv_send_message = TEXT_OFFSET + 0x103c0;
-hv_receive_message = TEXT_OFFSET + 0x103e0;
-hv_inquire_context = TEXT_OFFSET + 0x10400;
-hv_start_all_tiles = TEXT_OFFSET + 0x10420;
-hv_dev_open = TEXT_OFFSET + 0x10440;
-hv_dev_close = TEXT_OFFSET + 0x10460;
-hv_dev_pread = TEXT_OFFSET + 0x10480;
-hv_dev_pwrite = TEXT_OFFSET + 0x104a0;
-hv_dev_poll = TEXT_OFFSET + 0x104c0;
-hv_dev_poll_cancel = TEXT_OFFSET + 0x104e0;
-hv_dev_preada = TEXT_OFFSET + 0x10500;
-hv_dev_pwritea = TEXT_OFFSET + 0x10520;
-hv_flush_remote = TEXT_OFFSET + 0x10540;
-hv_console_putc = TEXT_OFFSET + 0x10560;
-hv_inquire_tiles = TEXT_OFFSET + 0x10580;
-hv_confstr = TEXT_OFFSET + 0x105a0;
-hv_reexec = TEXT_OFFSET + 0x105c0;
-hv_set_command_line = TEXT_OFFSET + 0x105e0;
-hv_clear_intr = TEXT_OFFSET + 0x10600;
-hv_enable_intr = TEXT_OFFSET + 0x10620;
-hv_disable_intr = TEXT_OFFSET + 0x10640;
-hv_raise_intr = TEXT_OFFSET + 0x10660;
-hv_trigger_ipi = TEXT_OFFSET + 0x10680;
-hv_store_mapping = TEXT_OFFSET + 0x106a0;
-hv_inquire_realpa = TEXT_OFFSET + 0x106c0;
-hv_flush_all = TEXT_OFFSET + 0x106e0;
-hv_get_ipi_pte = TEXT_OFFSET + 0x10700;
-hv_set_pte_super_shift = TEXT_OFFSET + 0x10720;
-hv_glue_internals = TEXT_OFFSET + 0x10740;
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
new file mode 100644
index 0000000..85c74ad
--- /dev/null
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Pull in the hypervisor header so we declare all the ABI functions
+ * with the underscore versions, then undef the names so that we can
+ * provide our own wrapper versions.
+ */
+#define hv_init _hv_init
+#define hv_install_context _hv_install_context
+#define hv_sysconf _hv_sysconf
+#define hv_get_rtc _hv_get_rtc
+#define hv_set_rtc _hv_set_rtc
+#define hv_flush_asid _hv_flush_asid
+#define hv_flush_page _hv_flush_page
+#define hv_flush_pages _hv_flush_pages
+#define hv_restart _hv_restart
+#define hv_halt _hv_halt
+#define hv_power_off _hv_power_off
+#define hv_inquire_physical _hv_inquire_physical
+#define hv_inquire_memory_controller _hv_inquire_memory_controller
+#define hv_inquire_virtual _hv_inquire_virtual
+#define hv_inquire_asid _hv_inquire_asid
+#define hv_nanosleep _hv_nanosleep
+#define hv_console_read_if_ready _hv_console_read_if_ready
+#define hv_console_write _hv_console_write
+#define hv_downcall_dispatch _hv_downcall_dispatch
+#define hv_inquire_topology _hv_inquire_topology
+#define hv_fs_findfile _hv_fs_findfile
+#define hv_fs_fstat _hv_fs_fstat
+#define hv_fs_pread _hv_fs_pread
+#define hv_physaddr_read64 _hv_physaddr_read64
+#define hv_physaddr_write64 _hv_physaddr_write64
+#define hv_get_command_line _hv_get_command_line
+#define hv_set_caching _hv_set_caching
+#define hv_bzero_page _hv_bzero_page
+#define hv_register_message_state _hv_register_message_state
+#define hv_send_message _hv_send_message
+#define hv_receive_message _hv_receive_message
+#define hv_inquire_context _hv_inquire_context
+#define hv_start_all_tiles _hv_start_all_tiles
+#define hv_dev_open _hv_dev_open
+#define hv_dev_close _hv_dev_close
+#define hv_dev_pread _hv_dev_pread
+#define hv_dev_pwrite _hv_dev_pwrite
+#define hv_dev_poll _hv_dev_poll
+#define hv_dev_poll_cancel _hv_dev_poll_cancel
+#define hv_dev_preada _hv_dev_preada
+#define hv_dev_pwritea _hv_dev_pwritea
+#define hv_flush_remote _hv_flush_remote
+#define hv_console_putc _hv_console_putc
+#define hv_inquire_tiles _hv_inquire_tiles
+#define hv_confstr _hv_confstr
+#define hv_reexec _hv_reexec
+#define hv_set_command_line _hv_set_command_line
+#define hv_clear_intr _hv_clear_intr
+#define hv_enable_intr _hv_enable_intr
+#define hv_disable_intr _hv_disable_intr
+#define hv_raise_intr _hv_raise_intr
+#define hv_trigger_ipi _hv_trigger_ipi
+#define hv_store_mapping _hv_store_mapping
+#define hv_inquire_realpa _hv_inquire_realpa
+#define hv_flush_all _hv_flush_all
+#define hv_get_ipi_pte _hv_get_ipi_pte
+#define hv_set_pte_super_shift _hv_set_pte_super_shift
+#define hv_console_set_ipi _hv_console_set_ipi
+#include <hv/hypervisor.h>
+#undef hv_init
+#undef hv_install_context
+#undef hv_sysconf
+#undef hv_get_rtc
+#undef hv_set_rtc
+#undef hv_flush_asid
+#undef hv_flush_page
+#undef hv_flush_pages
+#undef hv_restart
+#undef hv_halt
+#undef hv_power_off
+#undef hv_inquire_physical
+#undef hv_inquire_memory_controller
+#undef hv_inquire_virtual
+#undef hv_inquire_asid
+#undef hv_nanosleep
+#undef hv_console_read_if_ready
+#undef hv_console_write
+#undef hv_downcall_dispatch
+#undef hv_inquire_topology
+#undef hv_fs_findfile
+#undef hv_fs_fstat
+#undef hv_fs_pread
+#undef hv_physaddr_read64
+#undef hv_physaddr_write64
+#undef hv_get_command_line
+#undef hv_set_caching
+#undef hv_bzero_page
+#undef hv_register_message_state
+#undef hv_send_message
+#undef hv_receive_message
+#undef hv_inquire_context
+#undef hv_start_all_tiles
+#undef hv_dev_open
+#undef hv_dev_close
+#undef hv_dev_pread
+#undef hv_dev_pwrite
+#undef hv_dev_poll
+#undef hv_dev_poll_cancel
+#undef hv_dev_preada
+#undef hv_dev_pwritea
+#undef hv_flush_remote
+#undef hv_console_putc
+#undef hv_inquire_tiles
+#undef hv_confstr
+#undef hv_reexec
+#undef hv_set_command_line
+#undef hv_clear_intr
+#undef hv_enable_intr
+#undef hv_disable_intr
+#undef hv_raise_intr
+#undef hv_trigger_ipi
+#undef hv_store_mapping
+#undef hv_inquire_realpa
+#undef hv_flush_all
+#undef hv_get_ipi_pte
+#undef hv_set_pte_super_shift
+#undef hv_console_set_ipi
+
+/*
+ * Define macros, modeled on <linux/syscalls.h>, that generate a wrapper
+ * function which invokes the same function with an underscore prefix.
+ * We can't use the existing __SC_xxx macros because we need to
+ * support up to nine arguments rather than up to six, and also this
+ * way the file stands alone from possible changes in the
+ * implementation of <linux/syscalls.h>.
+ */
+#define HV_WRAP0(type, name)					\
+	type name(void);					\
+	type name(void)						\
+	{							\
+		return _##name();				\
+	}
+#define __HV_DECL1(t1, a1)	t1 a1
+#define __HV_DECL2(t2, a2, ...) t2 a2, __HV_DECL1(__VA_ARGS__)
+#define __HV_DECL3(t3, a3, ...) t3 a3, __HV_DECL2(__VA_ARGS__)
+#define __HV_DECL4(t4, a4, ...) t4 a4, __HV_DECL3(__VA_ARGS__)
+#define __HV_DECL5(t5, a5, ...) t5 a5, __HV_DECL4(__VA_ARGS__)
+#define __HV_DECL6(t6, a6, ...) t6 a6, __HV_DECL5(__VA_ARGS__)
+#define __HV_DECL7(t7, a7, ...) t7 a7, __HV_DECL6(__VA_ARGS__)
+#define __HV_DECL8(t8, a8, ...) t8 a8, __HV_DECL7(__VA_ARGS__)
+#define __HV_DECL9(t9, a9, ...) t9 a9, __HV_DECL8(__VA_ARGS__)
+#define __HV_PASS1(t1, a1)	a1
+#define __HV_PASS2(t2, a2, ...) a2, __HV_PASS1(__VA_ARGS__)
+#define __HV_PASS3(t3, a3, ...) a3, __HV_PASS2(__VA_ARGS__)
+#define __HV_PASS4(t4, a4, ...) a4, __HV_PASS3(__VA_ARGS__)
+#define __HV_PASS5(t5, a5, ...) a5, __HV_PASS4(__VA_ARGS__)
+#define __HV_PASS6(t6, a6, ...) a6, __HV_PASS5(__VA_ARGS__)
+#define __HV_PASS7(t7, a7, ...) a7, __HV_PASS6(__VA_ARGS__)
+#define __HV_PASS8(t8, a8, ...) a8, __HV_PASS7(__VA_ARGS__)
+#define __HV_PASS9(t9, a9, ...) a9, __HV_PASS8(__VA_ARGS__)
+#define HV_WRAPx(x, type, name, ...)				\
+	type name(__HV_DECL##x(__VA_ARGS__));			\
+	type name(__HV_DECL##x(__VA_ARGS__))			\
+	{							\
+		return _##name(__HV_PASS##x(__VA_ARGS__));	\
+	}
+#define HV_WRAP1(type, name, ...) HV_WRAPx(1, type, name, __VA_ARGS__)
+#define HV_WRAP2(type, name, ...) HV_WRAPx(2, type, name, __VA_ARGS__)
+#define HV_WRAP3(type, name, ...) HV_WRAPx(3, type, name, __VA_ARGS__)
+#define HV_WRAP4(type, name, ...) HV_WRAPx(4, type, name, __VA_ARGS__)
+#define HV_WRAP5(type, name, ...) HV_WRAPx(5, type, name, __VA_ARGS__)
+#define HV_WRAP6(type, name, ...) HV_WRAPx(6, type, name, __VA_ARGS__)
+#define HV_WRAP7(type, name, ...) HV_WRAPx(7, type, name, __VA_ARGS__)
+#define HV_WRAP8(type, name, ...) HV_WRAPx(8, type, name, __VA_ARGS__)
+#define HV_WRAP9(type, name, ...) HV_WRAPx(9, type, name, __VA_ARGS__)
+
+/* List all the hypervisor API functions. */
+HV_WRAP4(void, hv_init, HV_VersionNumber, interface_version_number,
+	 int, chip_num, int, chip_rev_num, int, client_pl)
+HV_WRAP1(long, hv_sysconf, HV_SysconfQuery, query)
+HV_WRAP3(int, hv_confstr, HV_ConfstrQuery, query, HV_VirtAddr, buf, int, len)
+#if CHIP_HAS_IPI()
+HV_WRAP3(int, hv_get_ipi_pte, HV_Coord, tile, int, pl, HV_PTE*, pte)
+HV_WRAP3(int, hv_console_set_ipi, int, ipi, int, event, HV_Coord, coord)
+#else
+HV_WRAP1(void, hv_enable_intr, HV_IntrMask, enab_mask)
+HV_WRAP1(void, hv_disable_intr, HV_IntrMask, disab_mask)
+HV_WRAP1(void, hv_clear_intr, HV_IntrMask, clear_mask)
+HV_WRAP1(void, hv_raise_intr, HV_IntrMask, raise_mask)
+HV_WRAP2(HV_Errno, hv_trigger_ipi, HV_Coord, tile, int, interrupt)
+#endif /* !CHIP_HAS_IPI() */
+HV_WRAP3(int, hv_store_mapping, HV_VirtAddr, va, unsigned int, len,
+	 HV_PhysAddr, pa)
+HV_WRAP2(HV_PhysAddr, hv_inquire_realpa, HV_PhysAddr, cpa, unsigned int, len)
+HV_WRAP0(HV_RTCTime, hv_get_rtc)
+HV_WRAP1(void, hv_set_rtc, HV_RTCTime, time)
+HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
+	 HV_ASID, asid, __hv32, flags)
+HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
+HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
+HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
+HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
+	 unsigned long, size)
+HV_WRAP1(int, hv_flush_all, int, preserve_global)
+HV_WRAP2(void, hv_restart, HV_VirtAddr, cmd, HV_VirtAddr, args)
+HV_WRAP0(void, hv_halt)
+HV_WRAP0(void, hv_power_off)
+HV_WRAP1(int, hv_reexec, HV_PhysAddr, entry)
+HV_WRAP0(HV_Topology, hv_inquire_topology)
+HV_WRAP3(HV_Errno, hv_inquire_tiles, HV_InqTileSet, set, HV_VirtAddr, cpumask,
+	 int, length)
+HV_WRAP1(HV_PhysAddrRange, hv_inquire_physical, int, idx)
+HV_WRAP2(HV_MemoryControllerInfo, hv_inquire_memory_controller, HV_Coord, coord,
+	 int, controller)
+HV_WRAP1(HV_VirtAddrRange, hv_inquire_virtual, int, idx)
+HV_WRAP1(HV_ASIDRange, hv_inquire_asid, int, idx)
+HV_WRAP1(void, hv_nanosleep, int, nanosecs)
+HV_WRAP0(int, hv_console_read_if_ready)
+HV_WRAP1(void, hv_console_putc, int, byte)
+HV_WRAP2(int, hv_console_write, HV_VirtAddr, bytes, int, len)
+HV_WRAP0(void, hv_downcall_dispatch)
+HV_WRAP1(int, hv_fs_findfile, HV_VirtAddr, filename)
+HV_WRAP1(HV_FS_StatInfo, hv_fs_fstat, int, inode)
+HV_WRAP4(int, hv_fs_pread, int, inode, HV_VirtAddr, buf,
+	 int, length, int, offset)
+HV_WRAP2(unsigned long long, hv_physaddr_read64, HV_PhysAddr, addr,
+	 HV_PTE, access)
+HV_WRAP3(void, hv_physaddr_write64, HV_PhysAddr, addr, HV_PTE, access,
+	 unsigned long long, val)
+HV_WRAP2(int, hv_get_command_line, HV_VirtAddr, buf, int, length)
+HV_WRAP2(HV_Errno, hv_set_command_line, HV_VirtAddr, buf, int, length)
+HV_WRAP1(void, hv_set_caching, unsigned long, bitmask)
+HV_WRAP2(void, hv_bzero_page, HV_VirtAddr, va, unsigned int, size)
+HV_WRAP1(HV_Errno, hv_register_message_state, HV_MsgState*, msgstate)
+HV_WRAP4(int, hv_send_message, HV_Recipient *, recips, int, nrecip,
+	 HV_VirtAddr, buf, int, buflen)
+HV_WRAP3(HV_RcvMsgInfo, hv_receive_message, HV_MsgState, msgstate,
+	 HV_VirtAddr, buf, int, buflen)
+HV_WRAP0(void, hv_start_all_tiles)
+HV_WRAP2(int, hv_dev_open, HV_VirtAddr, name, __hv32, flags)
+HV_WRAP1(int, hv_dev_close, int, devhdl)
+HV_WRAP5(int, hv_dev_pread, int, devhdl, __hv32, flags, HV_VirtAddr, va,
+	 __hv32, len, __hv64, offset)
+HV_WRAP5(int, hv_dev_pwrite, int, devhdl, __hv32, flags, HV_VirtAddr, va,
+	 __hv32, len, __hv64, offset)
+HV_WRAP3(int, hv_dev_poll, int, devhdl, __hv32, events, HV_IntArg, intarg)
+HV_WRAP1(int, hv_dev_poll_cancel, int, devhdl)
+HV_WRAP6(int, hv_dev_preada, int, devhdl, __hv32, flags, __hv32, sgl_len,
+	 HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg)
+HV_WRAP6(int, hv_dev_pwritea, int, devhdl, __hv32, flags, __hv32, sgl_len,
+	 HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg)
+HV_WRAP9(int, hv_flush_remote, HV_PhysAddr, cache_pa,
+	 unsigned long, cache_control, unsigned long*, cache_cpumask,
+	 HV_VirtAddr, tlb_va, unsigned long, tlb_length,
+	 unsigned long, tlb_pgsize, unsigned long*, tlb_cpumask,
+	 HV_Remote_ASID*, asids, int, asidcount)
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index cb52d66..088d5c1 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -28,20 +28,10 @@
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
 
-#ifdef CONFIG_PREEMPT
-# error "No support for kernel preemption currently"
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
-#if !CHIP_HAS_WH64()
-	/* By making this an empty macro, we can use wh64 in the code. */
-	.macro  wh64 reg
-	.endm
-#endif
-
 	.macro  push_reg reg, ptr=sp, delta=-4
 	{
 	 sw     \ptr, \reg
@@ -189,7 +179,7 @@ intvec_\vecname:
 	 * point sp at the top aligned address on the actual stack page.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_0
-	mm      r0, r0, zero, LOG2_THREAD_SIZE, 31
+	mm      r0, r0, zero, LOG2_NR_CPU_IDS, 31
 
 0:
 	/*
@@ -207,6 +197,9 @@ intvec_\vecname:
 	 *    cache line 1: r14...r29
 	 *    cache line 0: 2 x frame, r0..r13
 	 */
+#if STACK_TOP_DELTA != 64
+#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs()
+#endif
 	andi    r0, r0, -64
 
 	/*
@@ -326,18 +319,14 @@ intvec_\vecname:
 	 movei  r3, -1   /* not used, but set for consistency */
 	}
 	.else
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.ifc \c_routine, op_handle_aux_perf_interrupt
 	{
 	 mfspr  r2, AUX_PERF_COUNT_STS
 	 movei  r3, -1   /* not used, but set for consistency */
 	}
 	.else
-#endif
 	movei   r3, 0
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.endif
-#endif
 	.endif
 	.endif
 	.endif
@@ -354,7 +343,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -468,7 +457,7 @@ intvec_\vecname:
 	}
 	{
 	 auli   r21, r21, ha16(__per_cpu_offset)
-	 mm     r20, r20, zero, 0, LOG2_THREAD_SIZE-1
+	 mm     r20, r20, zero, 0, LOG2_NR_CPU_IDS-1
 	}
 	s2a     r20, r20, r21
 	lw      tp, r20
@@ -562,7 +551,6 @@ intvec_\vecname:
 	.endif
 	mtspr   INTERRUPT_CRITICAL_SECTION, zero
 
-#if CHIP_HAS_WH64()
 	/*
 	 * Prepare the first 256 stack bytes to be rapidly accessible
 	 * without having to fetch the background data.  We don't really
@@ -583,7 +571,6 @@ intvec_\vecname:
 	 addi   r52, r52, -64
 	}
 	wh64    r52
-#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	.ifnc \function,handle_nmi
@@ -762,7 +749,7 @@ intvec_\vecname:
 	.macro  dc_dispatch vecnum, vecname
 	.org    (\vecnum << 8)
 intvec_\vecname:
-	j       hv_downcall_dispatch
+	j       _hv_downcall_dispatch
 	ENDPROC(intvec_\vecname)
 	.endm
 
@@ -812,17 +799,34 @@ STD_ENTRY(interrupt_return)
 	}
 	lw      r29, r29
 	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	bzt     r29, .Lresume_userspace
+
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption. */
+	GET_THREAD_INFO(r29)
+	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
 	{
-	 bzt    r29, .Lresume_userspace
-	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	 lw     r28, r28
+	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
 	}
+	{
+	 andi   r28, r28, _TIF_NEED_RESCHED
+	 lw     r29, r29
+	}
+	bzt     r28, 1f
+	bnz     r29, 1f
+	jal     preempt_schedule_irq
+	FEEDBACK_REENTER(interrupt_return)
+1:
+#endif
 
 	/* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */
 	{
-	 lw     r28, r29
+	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
 	 moveli r27, lo16(_cpu_idle_nap)
 	}
 	{
+	 lw     r28, r29
 	 auli   r27, r27, ha16(_cpu_idle_nap)
 	}
 	{
@@ -1420,7 +1424,6 @@ handle_ill:
 	{
 	 lw     r0, r0          /* indirect thru thread_info to get task_info*/
 	 addi   r1, sp, C_ABI_SAVE_AREA_SIZE  /* put ptregs pointer into r1 */
-	 move   r2, zero        /* load error code into r2 */
 	}
 
 	jal     send_sigtrap    /* issue a SIGTRAP */
@@ -1518,12 +1521,10 @@ STD_ENTRY(_sys_clone)
 	__HEAD
 	.align 64
 	/* Align much later jump on the start of a cache line. */
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	nop
 #if PAGE_SIZE >= 0x10000
 	nop
 #endif
-#endif
 ENTRY(sys_cmpxchg)
 
 	/*
@@ -1557,45 +1558,6 @@ ENTRY(sys_cmpxchg)
 # error Code here assumes PAGE_OFFSET can be loaded with just hi16()
 #endif
 
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	{
-	 /* Check for unaligned input. */
-	 bnz    sp, .Lcmpxchg_badaddr
-	 mm     r25, r0, zero, 3, PAGE_SHIFT-1
-	}
-	{
-	 crc32_32 r25, zero, r25
-	 moveli r21, lo16(atomic_lock_ptr)
-	}
-	{
-	 auli   r21, r21, ha16(atomic_lock_ptr)
-	 auli   r23, zero, hi16(PAGE_OFFSET)  /* hugepage-aligned */
-	}
-	{
-	 shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
-	 slt_u  r23, r0, r23
-	 lw	r26, r0  /* see comment in the "#else" for the "lw r26". */
-	}
-	{
-	 s2a    r21, r20, r21
-	 bbns   r23, .Lcmpxchg_badaddr
-	}
-	{
-	 lw     r21, r21
-	 seqi	r23, TREG_SYSCALL_NR_NAME, __NR_FAST_cmpxchg64
-	 andi	r25, r25, ATOMIC_HASH_L2_SIZE - 1
-	}
-	{
-	 /* Branch away at this point if we're doing a 64-bit cmpxchg. */
-	 bbs    r23, .Lcmpxchg64
-	 andi   r23, r0, 7       /* Precompute alignment for cmpxchg64. */
-	}
-	{
-	 s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-	 j      .Lcmpxchg32_tns   /* see comment in the #else for the jump. */
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 	{
 	 /* Check for unaligned input. */
 	 bnz    sp, .Lcmpxchg_badaddr
@@ -1609,7 +1571,7 @@ ENTRY(sys_cmpxchg)
 	  * Because of C pointer arithmetic, we want to compute this:
 	  *
 	  * ((char*)atomic_locks +
-	  *  (((r0 >> 3) & (1 << (ATOMIC_HASH_SIZE - 1))) << 2))
+	  *  (((r0 >> 3) & ((1 << ATOMIC_HASH_SHIFT) - 1)) << 2))
 	  *
 	  * Instead of two shifts we just ">> 1", and use 'mm'
 	  * to ignore the low and high bits we don't want.
@@ -1620,12 +1582,9 @@ ENTRY(sys_cmpxchg)
 
 	 /*
 	  * Ensure that the TLB is loaded before we take out the lock.
-	  * On tilepro, this will start fetching the value all the way
-	  * into our L1 as well (and if it gets modified before we
-	  * grab the lock, it will be invalidated from our cache
-	  * before we reload it).  On tile64, we'll start fetching it
-	  * into our L1 if we're the home, and if we're not, we'll
-	  * still at least start fetching it into the home's L2.
+	  * This will start fetching the value all the way into our L1
+	  * as well (and if it gets modified before we grab the lock,
+	  * it will be invalidated from our cache before we reload it).
 	  */
 	 lw	r26, r0
 	}
@@ -1668,8 +1627,6 @@ ENTRY(sys_cmpxchg)
 	 j      .Lcmpxchg32_tns
 	}
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* Symbol for do_page_fault_ics() to use to compare against the PC. */
 .global __sys_cmpxchg_grab_lock
 __sys_cmpxchg_grab_lock:
@@ -1807,9 +1764,6 @@ __sys_cmpxchg_grab_lock:
 	.align 64
 .Lcmpxchg64:
 	{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	 s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-#endif
 	 bzt     r23, .Lcmpxchg64_tns
 	}
 	j       .Lcmpxchg_badaddr
@@ -1875,8 +1829,8 @@ int_unalign:
 	push_extra_callee_saves r0
 	j       do_trap
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
 
 #define op_handle_perf_interrupt bad_intr
 #define op_handle_aux_perf_interrupt bad_intr
@@ -1944,10 +1898,8 @@ int_unalign:
 		     do_page_fault
 	int_hand     INT_SN_CPL, SN_CPL, bad_intr
 	int_hand     INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	int_hand     INT_AUX_PERF_COUNT, AUX_PERF_COUNT, \
 		     op_handle_aux_perf_interrupt, handle_nmi
-#endif
 
 	/* Synthetic interrupt delivered only by the simulator */
 	int_hand     INT_BREAKPOINT, BREAKPOINT, do_breakpoint
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 85d4839..ec755d3 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -17,25 +17,33 @@
 #include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
+#include <linux/init.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/asm-offsets.h>
 #include <asm/types.h>
+#include <asm/traps.h>
 #include <asm/signal.h>
 #include <hv/hypervisor.h>
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
 
-#ifdef CONFIG_PREEMPT
-# error "No support for kernel preemption currently"
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set).  Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
 
 	.macro  push_reg reg, ptr=sp, delta=-8
 	{
@@ -98,6 +106,185 @@
 	}
 	.endm
 
+	/*
+	 * Unalign data exception fast handling: In order to handle
+	 * unaligned data access, a fast JIT version is generated and stored
+	 * in a specific area in user space. We first need to do a quick poke
+	 * to see if the JIT is available. We use certain bits in the fault
+	 * PC (3 to 9 is used for 16KB page size) as index to address the JIT
+	 * code area. The first 64bit word is the fault PC, and the 2nd one is
+	 * the fault bundle itself. If these 2 words both match, then we
+	 * directly "iret" to JIT code. If not, a slow path is invoked to
+	 * generate new JIT code. Note: the current JIT code WILL be
+	 * overwritten if it existed. So, ideally we can handle 128 unalign
+	 * fixups via JIT. For lookup efficiency and to effectively support
+	 * tight loops with multiple unaligned references, a simple
+	 * direct-mapped cache is used.
+	 *
+	 * SPR_EX_CONTEXT_K_0 is modified to return to JIT code.
+	 * SPR_EX_CONTEXT_K_1 has ICS set.
+	 * SPR_EX_CONTEXT_0_0 is setup to user program's next PC.
+	 * SPR_EX_CONTEXT_0_1 = 0.
+	 */
+	.macro int_hand_unalign_fast  vecnum, vecname
+	.org  (\vecnum << 8)
+intvec_\vecname:
+	/* Put r3 in SPR_SYSTEM_SAVE_K_1.  */
+	mtspr   SPR_SYSTEM_SAVE_K_1, r3
+
+	mfspr   r3, SPR_EX_CONTEXT_K_1
+	/*
+	 * Examine if exception comes from user without ICS set.
+	 * If not, just go directly to the slow path.
+	 */
+	bnez    r3, hand_unalign_slow_nonuser
+
+	mfspr   r3, SPR_SYSTEM_SAVE_K_0
+
+	/* Get &thread_info->unalign_jit_tmp[0] in r3. */
+	bfexts  r3, r3, 0, CPU_SHIFT-1
+	mm      r3, zero, LOG2_THREAD_SIZE, 63
+	addli   r3, r3, THREAD_INFO_UNALIGN_JIT_TMP_OFFSET
+
+	/*
+	 * Save r0, r1, r2 into thread_info array r3 points to
+	 * from low to high memory in order.
+	 */
+	st_add  r3, r0, 8
+	st_add  r3, r1, 8
+	{
+	 st_add r3, r2, 8
+	 andi   r2, sp, 7
+	}
+
+	/* Save stored r3 value so we can revert it on a page fault. */
+	mfspr   r1, SPR_SYSTEM_SAVE_K_1
+	st      r3, r1
+
+	{
+	 /* Generate a SIGBUS if sp is not 8-byte aligned. */
+	 bnez   r2, hand_unalign_slow_badsp
+	}
+
+	/*
+	 * Get the thread_info in r0; load r1 with pc. Set the low bit of sp
+	 * as an indicator to the page fault code in case we fault.
+	 */
+	{
+	 ori    sp, sp, 1
+	 mfspr  r1, SPR_EX_CONTEXT_K_0
+	}
+
+	/* Add the jit_info offset in thread_info; extract r1 [3:9] into r2. */
+	{
+	 addli  r0, r3, THREAD_INFO_UNALIGN_JIT_BASE_OFFSET - \
+	  (THREAD_INFO_UNALIGN_JIT_TMP_OFFSET + (3 * 8))
+	 bfextu r2, r1, 3, (2 + PAGE_SHIFT - UNALIGN_JIT_SHIFT)
+	}
+
+	/* Load the jit_info; multiply r2 by 128. */
+	{
+	 ld     r0, r0
+	 shli   r2, r2, UNALIGN_JIT_SHIFT
+	}
+
+	/*
+	 * If r0 is NULL, the JIT page is not mapped, so go to slow path;
+	 * add offset r2 to r0 at the same time.
+	 */
+	{
+	 beqz   r0, hand_unalign_slow
+	 add    r2, r0, r2
+	}
+
+        /*
+	 * We are loading from userspace (both the JIT info PC and
+	 * instruction word, and the instruction word we executed)
+	 * and since either could fault while holding the interrupt
+	 * critical section, we must tag this region and check it in
+	 * do_page_fault() to handle it properly.
+	 */
+ENTRY(__start_unalign_asm_code)
+
+	/* Load first word of JIT in r0 and increment r2 by 8. */
+	ld_add  r0, r2, 8
+
+	/*
+	 * Compare the PC with the 1st word in JIT; load the fault bundle
+	 * into r1.
+	 */
+	{
+	 cmpeq  r0, r0, r1
+	 ld     r1, r1
+	}
+
+	/* Go to slow path if PC doesn't match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * Load the 2nd word of JIT, which is supposed to be the fault
+	 * bundle for a cache hit. Increment r2; after this bundle r2 will
+	 * point to the potential start of the JIT code we want to run.
+	 */
+	ld_add  r0, r2, 8
+
+	/* No further accesses to userspace are done after this point. */
+ENTRY(__end_unalign_asm_code)
+
+	/* Compare the real bundle with what is saved in the JIT area. */
+	{
+	 cmpeq  r0, r1, r0
+	 mtspr  SPR_EX_CONTEXT_0_1, zero
+	}
+
+	/* Go to slow path if the fault bundle does not match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * A cache hit is found.
+	 * r2 points to start of JIT code (3rd word).
+	 * r0 is the fault pc.
+	 * r1 is the fault bundle.
+	 * Reset the low bit of sp.
+	 */
+	{
+	 mfspr  r0, SPR_EX_CONTEXT_K_0
+	 andi   sp, sp, ~1
+	}
+
+	/* Write r2 into EX_CONTEXT_K_0 and increment PC. */
+	{
+	 mtspr  SPR_EX_CONTEXT_K_0, r2
+	 addi   r0, r0, 8
+	}
+
+	/*
+	 * Set ICS on kernel EX_CONTEXT_K_1 in order to "iret" to
+	 * user with ICS set. This way, if the JIT fixup causes another
+	 * unalign exception (which shouldn't be possible) the user
+	 * process will be terminated with SIGBUS. Also, our fixup will
+	 * run without interleaving with external interrupts.
+	 * Each fixup is at most 14 bundles, so it won't hold ICS for long.
+	 */
+	{
+	 movei  r1, PL_ICS_EX1(USER_PL, 1)
+	 mtspr  SPR_EX_CONTEXT_0_0, r0
+	}
+
+	{
+	 mtspr  SPR_EX_CONTEXT_K_1, r1
+	 addi   r3, r3, -(3 * 8)
+	}
+
+	/* Restore r0..r3. */
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld_add  r2, r3, 8
+	ld      r3, r3
+
+	iret
+	ENDPROC(intvec_\vecname)
+	.endm
 
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
@@ -118,15 +305,21 @@ intvec_feedback:
 	 * The "processing" argument specifies the code for processing
 	 * the interrupt. Defaults to "handle_interrupt".
 	 */
-	.macro  int_hand vecnum, vecname, c_routine, processing=handle_interrupt
-	.org    (\vecnum << 8)
+	.macro __int_hand vecnum, vecname, c_routine,processing=handle_interrupt
 intvec_\vecname:
 	/* Temporarily save a register so we have somewhere to work. */
 
 	mtspr   SPR_SYSTEM_SAVE_K_1, r0
 	mfspr   r0, SPR_EX_CONTEXT_K_1
 
-	andi    r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	/*
+	 * The unalign data fastpath code sets the low bit in sp to
+	 * force us to reset it here on fault.
+	 */
+	{
+	 blbs   sp, 2f
+	 IS_KERNEL_EX1(r0, r0)
+	}
 
 	.ifc    \vecnum, INT_DOUBLE_FAULT
 	/*
@@ -176,15 +369,15 @@ intvec_\vecname:
 	}
 	.endif
 
-
+2:
 	/*
-	 * SYSTEM_SAVE_K_0 holds the cpu number in the low bits, and
-	 * the current stack top in the higher bits.  So we recover
-	 * our stack top by just masking off the low bits, then
+	 * SYSTEM_SAVE_K_0 holds the cpu number in the high bits, and
+	 * the current stack top in the lower bits.  So we recover
+	 * our starting stack value by sign-extending the low bits, then
 	 * point sp at the top aligned address on the actual stack page.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_0
-	mm      r0, zero, LOG2_THREAD_SIZE, 63
+	bfexts  r0, r0, 0, CPU_SHIFT-1
 
 0:
 	/*
@@ -206,6 +399,9 @@ intvec_\vecname:
 	 *    cache line 1: r6...r13
 	 *    cache line 0: 2 x frame, r0..r5
 	 */
+#if STACK_TOP_DELTA != 64
+#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs()
+#endif
 	andi    r0, r0, -64
 
 	/*
@@ -305,7 +501,7 @@ intvec_\vecname:
 	mfspr   r3, SPR_SYSTEM_SAVE_K_2   /* info about page fault */
 	.else
 	.ifc \vecnum, INT_ILL_TRANS
-	mfspr   r2, ILL_TRANS_REASON
+	mfspr   r2, ILL_VA_PC
 	.else
 	.ifc \vecnum, INT_DOUBLE_FAULT
 	mfspr   r2, SPR_SYSTEM_SAVE_K_2   /* double fault info from HV */
@@ -315,12 +511,10 @@ intvec_\vecname:
 	.else
 	.ifc \c_routine, op_handle_perf_interrupt
 	mfspr   r2, PERF_COUNT_STS
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.else
 	.ifc \c_routine, op_handle_aux_perf_interrupt
 	mfspr   r2, AUX_PERF_COUNT_STS
 	.endif
-#endif
 	.endif
 	.endif
 	.endif
@@ -339,7 +533,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -455,11 +649,12 @@ intvec_\vecname:
 	/*
 	 * If we will be returning to the kernel, we will need to
 	 * reset the interrupt masks to the state they had before.
-	 * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+	 * Set DISABLE_IRQ in flags iff we came from kernel pl with
+	 * irqs disabled.
 	 */
 	mfspr   r32, SPR_EX_CONTEXT_K_1
 	{
-	 andi   r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(r32, r32)  /* mask the EX1 value just read into r32 */
 	 PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
 	}
 	beqzt   r32, 1f       /* zero if from user space */
@@ -503,7 +698,7 @@ intvec_\vecname:
 	}
 	{
 	 shl16insli r21, r21, hw1(__per_cpu_offset)
-	 bfextu r20, r20, 0, LOG2_THREAD_SIZE-1
+	 bfextu r20, r20, CPU_SHIFT, 63
 	}
 	shl16insli r21, r21, hw0(__per_cpu_offset)
 	shl3add r20, r20, r21
@@ -585,7 +780,7 @@ intvec_\vecname:
 	.macro  dc_dispatch vecnum, vecname
 	.org    (\vecnum << 8)
 intvec_\vecname:
-	j       hv_downcall_dispatch
+	j       _hv_downcall_dispatch
 	ENDPROC(intvec_\vecname)
 	.endm
 
@@ -626,14 +821,36 @@ STD_ENTRY(interrupt_return)
 	 PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
 	}
 	ld      r29, r29
-	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	IS_KERNEL_EX1(r29, r29)
 	{
 	 beqzt  r29, .Lresume_userspace
-	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	 move   r29, sp
+	}
+
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption. */
+	EXTRACT_THREAD_INFO(r29)
+	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
+	{
+	 ld     r28, r28
+	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+	}
+	{
+	 andi   r28, r28, _TIF_NEED_RESCHED
+	 ld4s   r29, r29
 	}
+	beqzt   r28, 1f
+	bnez    r29, 1f
+	jal     preempt_schedule_irq
+	FEEDBACK_REENTER(interrupt_return)
+1:
+#endif
 
 	/* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */
-	moveli  r27, hw2_last(_cpu_idle_nap)
+	{
+	 moveli r27, hw2_last(_cpu_idle_nap)
+	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	}
 	{
 	 ld     r28, r29
 	 shl16insli r27, r27, hw1(_cpu_idle_nap)
@@ -728,7 +945,7 @@ STD_ENTRY(interrupt_return)
 	 PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
 	}
 	{
-	 andi   r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+	 IS_KERNEL_EX1(r0, r0)
 	 ld     r32, r32
 	}
 	bnez    r0, 1f
@@ -799,7 +1016,7 @@ STD_ENTRY(interrupt_return)
 	pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
 	{
 	 mtspr  SPR_EX_CONTEXT_K_1, lr
-	 andi   lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(lr, lr)
 	}
 	{
 	 mtspr  SPR_EX_CONTEXT_K_0, r21
@@ -1223,10 +1440,31 @@ STD_ENTRY(_sys_clone)
 	j       sys_clone
 	STD_ENDPROC(_sys_clone)
 
-/* The single-step support may need to read all the registers. */
+	/*
+	 * Recover r3, r2, r1 and r0 here saved by unalign fast vector.
+	 * The vector area limit is 32 bundles, so we handle the reload here.
+	 * r0, r1, r2 are in thread_info from low to high memory in order.
+	 * r3 points to location the original r3 was saved.
+	 * We put this code in the __HEAD section so it can be reached
+	 * via a conditional branch from the fast path.
+	 */
+	__HEAD
+hand_unalign_slow:
+	andi    sp, sp, ~1
+hand_unalign_slow_badsp:
+	addi    r3, r3, -(3 * 8)
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld      r2, r3
+hand_unalign_slow_nonuser:
+	mfspr   r3, SPR_SYSTEM_SAVE_K_1
+	__int_hand     INT_UNALIGN_DATA, UNALIGN_DATA_SLOW, int_unalign
+
+/* The unaligned data support needs to read all the registers. */
 int_unalign:
 	push_extra_callee_saves r0
-	j       do_trap
+	j       do_unaligned
+ENDPROC(hand_unalign_slow)
 
 /* Fill the return address stack with nonzero entries. */
 STD_ENTRY(fill_ra_stack)
@@ -1240,8 +1478,15 @@ STD_ENTRY(fill_ra_stack)
 4:	jrp	r0
 	STD_ENDPROC(fill_ra_stack)
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+	.macro int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
+	.org   (\vecnum << 8)
+		__int_hand   \vecnum, \vecname, \c_routine, \processing
+	.endm
+
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
+	.global intrpt_start
+intrpt_start:
 
 #define op_handle_perf_interrupt bad_intr
 #define op_handle_aux_perf_interrupt bad_intr
@@ -1272,7 +1517,7 @@ STD_ENTRY(fill_ra_stack)
 	int_hand     INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
 	int_hand     INT_SWINT_0, SWINT_0, do_trap
 	int_hand     INT_ILL_TRANS, ILL_TRANS, do_trap
-	int_hand     INT_UNALIGN_DATA, UNALIGN_DATA, int_unalign
+	int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
 	int_hand     INT_DTLB_MISS, DTLB_MISS, do_page_fault
 	int_hand     INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
 	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 3ccf2cd..0586fdb 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -55,7 +55,8 @@ static DEFINE_PER_CPU(int, irq_depth);
 
 /* State for allocating IRQs on Gx. */
 #if CHIP_HAS_IPI()
-static unsigned long available_irqs = ~(1UL << IRQ_RESCHEDULE);
+static unsigned long available_irqs = ((1UL << NR_IRQS) - 1) &
+				      (~(1UL << IRQ_RESCHEDULE));
 static DEFINE_SPINLOCK(available_irqs_lock);
 #endif
 
@@ -73,7 +74,8 @@ static DEFINE_SPINLOCK(available_irqs_lock);
 
 /*
  * The interrupt handling path, implemented in terms of HV interrupt
- * emulation on TILE64 and TILEPro, and IPI hardware on TILE-Gx.
+ * emulation on TILEPro, and IPI hardware on TILE-Gx.
+ * Entered with interrupts disabled.
  */
 void tile_dev_intr(struct pt_regs *regs, int intnum)
 {
@@ -233,7 +235,7 @@ void tile_irq_activate(unsigned int irq, int tile_irq_type)
 {
 	/*
 	 * We use handle_level_irq() by default because the pending
-	 * interrupt vector (whether modeled by the HV on TILE64 and
+	 * interrupt vector (whether modeled by the HV on
 	 * TILEPro or implemented in hardware on TILE-Gx) has
 	 * level-style semantics for each bit.  An interrupt fires
 	 * whenever a bit is high, not just at edges.
diff --git a/arch/tile/kernel/kgdb.c b/arch/tile/kernel/kgdb.c
new file mode 100644
index 0000000..4cd8838
--- /dev/null
+++ b/arch/tile/kernel/kgdb.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx KGDB support.
+ */
+
+#include <linux/ptrace.h>
+#include <linux/kgdb.h>
+#include <linux/kdebug.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+
+static tile_bundle_bits singlestep_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
+static unsigned long stepped_addr;
+static tile_bundle_bits stepped_instr;
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
+	{ "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[0])},
+	{ "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[1])},
+	{ "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[2])},
+	{ "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[3])},
+	{ "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[4])},
+	{ "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[5])},
+	{ "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[6])},
+	{ "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[7])},
+	{ "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[8])},
+	{ "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[9])},
+	{ "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[10])},
+	{ "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[11])},
+	{ "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[12])},
+	{ "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[13])},
+	{ "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[14])},
+	{ "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[15])},
+	{ "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[16])},
+	{ "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[17])},
+	{ "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[18])},
+	{ "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[19])},
+	{ "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[20])},
+	{ "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[21])},
+	{ "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[22])},
+	{ "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[23])},
+	{ "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[24])},
+	{ "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[25])},
+	{ "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[26])},
+	{ "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[27])},
+	{ "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[28])},
+	{ "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[29])},
+	{ "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[30])},
+	{ "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[31])},
+	{ "r32", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[32])},
+	{ "r33", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[33])},
+	{ "r34", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[34])},
+	{ "r35", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[35])},
+	{ "r36", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[36])},
+	{ "r37", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[37])},
+	{ "r38", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[38])},
+	{ "r39", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[39])},
+	{ "r40", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[40])},
+	{ "r41", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[41])},
+	{ "r42", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[42])},
+	{ "r43", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[43])},
+	{ "r44", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[44])},
+	{ "r45", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[45])},
+	{ "r46", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[46])},
+	{ "r47", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[47])},
+	{ "r48", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[48])},
+	{ "r49", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[49])},
+	{ "r50", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[50])},
+	{ "r51", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[51])},
+	{ "r52", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[52])},
+	{ "tp", GDB_SIZEOF_REG, offsetof(struct pt_regs, tp)},
+	{ "sp", GDB_SIZEOF_REG, offsetof(struct pt_regs, sp)},
+	{ "lr", GDB_SIZEOF_REG, offsetof(struct pt_regs, lr)},
+	{ "sn", GDB_SIZEOF_REG, -1},
+	{ "idn0", GDB_SIZEOF_REG, -1},
+	{ "idn1", GDB_SIZEOF_REG, -1},
+	{ "udn0", GDB_SIZEOF_REG, -1},
+	{ "udn1", GDB_SIZEOF_REG, -1},
+	{ "udn2", GDB_SIZEOF_REG, -1},
+	{ "udn3", GDB_SIZEOF_REG, -1},
+	{ "zero", GDB_SIZEOF_REG, -1},
+	{ "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, pc)},
+	{ "faultnum", GDB_SIZEOF_REG, offsetof(struct pt_regs, faultnum)},
+};
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+	else
+		memset(mem, 0, dbg_reg_def[regno].size);
+	return dbg_reg_def[regno].name;
+}
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return -EINVAL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+		       dbg_reg_def[regno].size);
+	return 0;
+}
+
+/*
+ * Similar to pt_regs_to_gdb_regs() except that process is sleeping and so
+ * we may not be able to get all the info.
+ */
+void
+sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
+{
+	int reg;
+	struct pt_regs *thread_regs;
+	unsigned long *ptr = gdb_regs;
+
+	if (task == NULL)
+		return;
+
+	/* Initialize to zero. */
+	memset(gdb_regs, 0, NUMREGBYTES);
+
+	thread_regs = task_pt_regs(task);
+	for (reg = 0; reg <= TREG_LAST_GPR; reg++)
+		*(ptr++) = thread_regs->regs[reg];
+
+	gdb_regs[TILEGX_PC_REGNUM] = thread_regs->pc;
+	gdb_regs[TILEGX_FAULTNUM_REGNUM] = thread_regs->faultnum;
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->pc = pc;
+}
+
+static void kgdb_call_nmi_hook(void *ignored)
+{
+	kgdb_nmicallback(raw_smp_processor_id(), NULL);
+}
+
+void kgdb_roundup_cpus(unsigned long flags)
+{
+	local_irq_enable();
+	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
+	local_irq_disable();
+}
+
+/*
+ * Convert a kernel address to the writable kernel text mapping.
+ */
+static unsigned long writable_address(unsigned long addr)
+{
+	unsigned long ret = 0;
+
+	if (core_kernel_text(addr))
+		ret = addr - MEM_SV_START + PAGE_OFFSET;
+	else if (is_module_text_address(addr))
+		ret = addr;
+	else
+		pr_err("Unknown virtual address 0x%lx\n", addr);
+
+	return ret;
+}
+
+/*
+ * Compute the pc reached after one step; branches test SrcA's *value*.
+ */
+static unsigned long get_step_address(struct pt_regs *regs)
+{
+	int src_reg, jump_off;
+	int br_off = 0;
+	long src_val = 0;	/* signed: bgez/bltz etc. compare against 0 */
+	unsigned long addr;
+	unsigned int opcode;
+	tile_bundle_bits bundle;
+
+	/* Move to the next instruction by default. */
+	addr = regs->pc + TILEGX_BUNDLE_SIZE_IN_BYTES;
+	bundle = *(unsigned long *)instruction_pointer(regs);
+
+	/* 0: X mode, Otherwise: Y mode. */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+		if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+		    get_RRROpcodeExtension_Y1(bundle) ==
+		    UNARY_RRR_1_OPCODE_Y1) {
+			opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_Y1:
+			case JALRP_UNARY_OPCODE_Y1:
+			case JR_UNARY_OPCODE_Y1:
+			case JRP_UNARY_OPCODE_Y1:
+				src_reg = get_SrcA_Y1(bundle);
+				dbg_get_reg(src_reg, &addr, regs);
+				break;
+			}
+		}
+	} else if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+		if (get_RRROpcodeExtension_X1(bundle) ==
+		    UNARY_RRR_0_OPCODE_X1) {
+			opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_X1:
+			case JALRP_UNARY_OPCODE_X1:
+			case JR_UNARY_OPCODE_X1:
+			case JRP_UNARY_OPCODE_X1:
+				src_reg = get_SrcA_X1(bundle);
+				dbg_get_reg(src_reg, &addr, regs);
+				break;
+			}
+		}
+	} else if (get_Opcode_X1(bundle) == JUMP_OPCODE_X1) {
+		opcode = get_JumpOpcodeExtension_X1(bundle);
+
+		switch (opcode) {
+		case JAL_JUMP_OPCODE_X1:
+		case J_JUMP_OPCODE_X1:
+			jump_off = sign_extend(get_JumpOff_X1(bundle), 27);
+			addr = regs->pc +
+				(jump_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES);
+			break;
+		}
+	} else if (get_Opcode_X1(bundle) == BRANCH_OPCODE_X1) {
+		dbg_get_reg(get_SrcA_X1(bundle), &src_val, regs);
+		opcode = get_BrType_X1(bundle);
+
+		switch (opcode) {
+		case BEQZT_BRANCH_OPCODE_X1:
+		case BEQZ_BRANCH_OPCODE_X1:
+			if (src_val == 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BGEZT_BRANCH_OPCODE_X1:
+		case BGEZ_BRANCH_OPCODE_X1:
+			if (src_val >= 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BGTZT_BRANCH_OPCODE_X1:
+		case BGTZ_BRANCH_OPCODE_X1:
+			if (src_val > 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLBCT_BRANCH_OPCODE_X1:
+		case BLBC_BRANCH_OPCODE_X1:
+			if (!(src_val & 1))
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLBST_BRANCH_OPCODE_X1:
+		case BLBS_BRANCH_OPCODE_X1:
+			if (src_val & 1)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLEZT_BRANCH_OPCODE_X1:
+		case BLEZ_BRANCH_OPCODE_X1:
+			if (src_val <= 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLTZT_BRANCH_OPCODE_X1:
+		case BLTZ_BRANCH_OPCODE_X1:
+			if (src_val < 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BNEZT_BRANCH_OPCODE_X1:
+		case BNEZ_BRANCH_OPCODE_X1:
+			if (src_val != 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		}
+
+		if (br_off != 0) {
+			br_off = sign_extend(br_off, 17);
+			addr = regs->pc +
+				(br_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES);
+		}
+	}
+
+	return addr;
+}
+
+/*
+ * Replace the next instruction after the current instruction with a
+ * breakpoint instruction.
+ */
+static void do_single_step(struct pt_regs *regs)
+{
+	unsigned long addr_wr;
+
+	/* Determine where the target instruction will send us to. */
+	stepped_addr = get_step_address(regs);
+	probe_kernel_read((char *)&stepped_instr, (char *)stepped_addr,
+			  BREAK_INSTR_SIZE);
+
+	addr_wr = writable_address(stepped_addr);
+	probe_kernel_write((char *)addr_wr, (char *)&singlestep_insn,
+			   BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE);
+}
+
+static void undo_single_step(struct pt_regs *regs)
+{
+	unsigned long addr_wr;
+
+	if (stepped_instr == 0)
+		return;
+
+	addr_wr = writable_address(stepped_addr);
+	probe_kernel_write((char *)addr_wr, (char *)&stepped_instr,
+			   BREAK_INSTR_SIZE);
+	stepped_instr = 0;
+	smp_wmb();
+	flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * Die-notifier callback: passes breakpoint, single-step and kernel-mode
+ * die events to the KGDB core; other user-mode events are ignored.
+ */
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+	int ret;
+	unsigned long flags;
+	struct die_args *args = (struct die_args *)ptr;
+	struct pt_regs *regs = args->regs;
+
+#ifdef CONFIG_KPROBES
+	/*
+	 * Return immediately if the kprobes fault notifier has set
+	 * DIE_PAGE_FAULT.
+	 */
+	if (cmd == DIE_PAGE_FAULT)
+		return NOTIFY_DONE;
+#endif /* CONFIG_KPROBES */
+
+	switch (cmd) {
+	case DIE_BREAK:
+	case DIE_COMPILED_BPT:
+		break;
+	case DIE_SSTEPBP:
+		local_irq_save(flags);
+		kgdb_handle_exception(0, SIGTRAP, 0, regs);
+		local_irq_restore(flags);
+		return NOTIFY_STOP;
+	default:
+		/* Userspace events, ignore. */
+		if (user_mode(regs))
+			return NOTIFY_DONE;
+	}
+
+	local_irq_save(flags);
+	ret = kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
+	local_irq_restore(flags);
+	if (ret)
+		return NOTIFY_DONE;
+
+	return NOTIFY_STOP;
+}
+
+static struct notifier_block kgdb_notifier = {
+	.notifier_call = kgdb_notify,
+};
+
+/*
+ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
+ * @vector: The error vector of the exception that happened.
+ * @signo: The signal number of the exception that happened.
+ * @err_code: The error code of the exception that happened.
+ * @remcom_in_buffer: The buffer of the packet we have read.
+ * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @regs: The &struct pt_regs of the current process.
+ *
+ * This function MUST handle the 'c' and 's' command packets,
+ * as well as packets to set / remove a hardware breakpoint, if used.
+ * If there are additional packets which the hardware needs to handle,
+ * they are handled here. The code should return -1 if it wants to
+ * process more packets, and a %0 or %1 if it wants to exit from the
+ * kgdb callback.
+ */
+int kgdb_arch_handle_exception(int vector, int signo, int err_code,
+			       char *remcom_in_buffer, char *remcom_out_buffer,
+			       struct pt_regs *regs)
+{
+	char *ptr;
+	unsigned long address;
+
+	/* Undo any stepping we may have done. */
+	undo_single_step(regs);
+
+	switch (remcom_in_buffer[0]) {
+	case 'c':
+	case 's':
+	case 'D':
+	case 'k':
+		/*
+		 * Try to read optional parameter, pc unchanged if no parm.
+		 * If this was a compiled-in breakpoint, we need to move
+		 * to the next instruction or we will just breakpoint
+		 * over and over again.
+		 */
+		ptr = &remcom_in_buffer[1];
+		if (kgdb_hex2long(&ptr, &address))
+			regs->pc = address;
+		else if (*(unsigned long *)regs->pc == compiled_bpt)
+			regs->pc += BREAK_INSTR_SIZE;
+
+		if (remcom_in_buffer[0] == 's') {
+			do_single_step(regs);
+			kgdb_single_step = 1;
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
+		} else
+			atomic_set(&kgdb_cpu_doing_single_step, -1);
+
+		return 0;
+	}
+
+	return -1; /* this means that we do not want to exit from the handler */
+}
+
+struct kgdb_arch arch_kgdb_ops;
+
+/*
+ * kgdb_arch_init - Perform any architecture specific initialization.
+ *
+ * This function will handle the initialization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+	tile_bundle_bits bundle = TILEGX_BPT_BUNDLE;
+
+	memcpy(arch_kgdb_ops.gdb_bpt_instr, &bundle, BREAK_INSTR_SIZE);
+	return register_die_notifier(&kgdb_notifier);
+}
+
+/*
+ * kgdb_arch_exit - Perform any architecture specific uninitialization.
+ *
+ * This function will handle the uninitialization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+	unregister_die_notifier(&kgdb_notifier);
+}
+
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned long addr_wr = writable_address(bpt->bpt_addr);
+
+	if (addr_wr == 0)
+		return -1;
+
+	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+				BREAK_INSTR_SIZE);
+	if (err)
+		return err;
+
+	err = probe_kernel_write((char *)addr_wr, arch_kgdb_ops.gdb_bpt_instr,
+				 BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range((unsigned long)bpt->bpt_addr,
+			   (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE);
+	return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned long addr_wr = writable_address(bpt->bpt_addr);
+
+	if (addr_wr == 0)
+		return -1;
+
+	err = probe_kernel_write((char *)addr_wr, (char *)bpt->saved_instr,
+				 BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range((unsigned long)bpt->bpt_addr,
+			   (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE);
+	return err;
+}
diff --git a/arch/tile/kernel/kprobes.c b/arch/tile/kernel/kprobes.c
new file mode 100644
index 0000000..27cdcac
--- /dev/null
+++ b/arch/tile/kernel/kprobes.c
@@ -0,0 +1,528 @@
+/*
+ * arch/tile/kernel/kprobes.c
+ * Kprobes on TILE-Gx
+ *
+ * Some portions copied from the MIPS version.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright 2006 Sony Corp.
+ * Copyright 2010 Cavium Networks
+ *
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+
+#include <arch/opcode.h>
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+tile_bundle_bits breakpoint_insn = TILEGX_BPT_BUNDLE;
+tile_bundle_bits breakpoint2_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
+
+/*
+ * Return 1 if the bundle holds a branch, a jump, or an instruction whose
+ * result depends on where it is executed (e.g. lnk); return 0 otherwise.
+ */
+static int __kprobes insn_has_control(kprobe_opcode_t insn)
+{
+	if (get_Mode(insn) != 0) {   /* Y-format: only Y1 is examined */
+		if (get_Opcode_Y1(insn) != RRR_1_OPCODE_Y1 ||
+		    get_RRROpcodeExtension_Y1(insn) != UNARY_RRR_1_OPCODE_Y1)
+			return 0;	/* Y1 holds no unary RRR op */
+
+		switch (get_UnaryOpcodeExtension_Y1(insn)) {
+		case JALRP_UNARY_OPCODE_Y1:
+		case JALR_UNARY_OPCODE_Y1:
+		case JRP_UNARY_OPCODE_Y1:
+		case JR_UNARY_OPCODE_Y1:
+		case LNK_UNARY_OPCODE_Y1:
+			return 1;
+		default:
+			return 0;
+		}
+	}
+
+	switch (get_Opcode_X1(insn)) {	/* X-format: only X1 is examined */
+	case BRANCH_OPCODE_X1:	/* branch instructions */
+	case JUMP_OPCODE_X1:	/* jump instructions: j and jal */
+		return 1;
+
+	case RRR_0_OPCODE_X1:   /* other jump instructions */
+		if (get_RRROpcodeExtension_X1(insn) != UNARY_RRR_0_OPCODE_X1)
+			return 0;
+		switch (get_UnaryOpcodeExtension_X1(insn)) {
+		case JALRP_UNARY_OPCODE_X1:
+		case JALR_UNARY_OPCODE_X1:
+		case JRP_UNARY_OPCODE_X1:
+		case JR_UNARY_OPCODE_X1:
+		case LNK_UNARY_OPCODE_X1:
+			return 1;
+		default:
+			return 0;
+		}
+	default:
+		return 0;
+	}
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+	unsigned long addr = (unsigned long)p->addr;
+
+	if (addr & (sizeof(kprobe_opcode_t) - 1))	/* misaligned address */
+		return -EINVAL;
+
+	if (insn_has_control(*p->addr)) {
+		pr_notice("Kprobes for control instructions are not "
+			  "supported\n");
+		return -EINVAL;
+	}
+
+	/* insn: must be on special executable page on tile. */
+	p->ainsn.insn = get_insn_slot();
+	if (!p->ainsn.insn)
+		return -ENOMEM;
+
+	/*
+	 * In the kprobe->ainsn.insn[] array we store the original
+	 * instruction at index zero and a break trap instruction at
+	 * index one.
+	 */
+	memcpy(&p->ainsn.insn[0], p->addr, sizeof(kprobe_opcode_t));
+	p->ainsn.insn[1] = breakpoint2_insn;
+	p->opcode = *p->addr;	/* written back by arch_disarm_kprobe() */
+
+	return 0;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+	unsigned long addr_wr;
+
+	/* Operate on writable kernel text mapping. */
+	addr_wr = (unsigned long)p->addr - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)addr_wr, &breakpoint_insn,
+		sizeof(breakpoint_insn)))	/* install the bpt bundle */
+		pr_err("%s: failed to enable kprobe\n", __func__);
+
+	smp_wmb();
+	flush_insn_slot(p);	/* NOTE(review): defined elsewhere; confirm range */
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *kp)
+{
+	unsigned long addr_wr;
+
+	/* Put kp->opcode back through the writable kernel text mapping. */
+	addr_wr = (unsigned long)kp->addr - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)addr_wr, &kp->opcode,
+		sizeof(kp->opcode)))
+		pr_err("%s: failed to disable kprobe\n", __func__);
+
+	smp_wmb();
+	flush_insn_slot(kp);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+	if (p->ainsn.insn) {	/* slot obtained in arch_prepare_kprobe() */
+		free_insn_slot(p->ainsn.insn, 0);
+		p->ainsn.insn = NULL;	/* a repeated call then does nothing */
+	}
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	kcb->prev_kprobe.kp = kprobe_running();	/* used on re-entry only */
+	kcb->prev_kprobe.status = kcb->kprobe_status;
+	kcb->prev_kprobe.saved_pc = kcb->kprobe_saved_pc;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+	kcb->kprobe_status = kcb->prev_kprobe.status;	/* undo the save */
+	kcb->kprobe_saved_pc = kcb->prev_kprobe.saved_pc;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+			struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, p);
+	kcb->kprobe_saved_pc = regs->pc;	/* read by resume_execution() */
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	/* Single step inline if the saved instruction is itself a break. */
+	if (p->opcode == breakpoint_insn ||
+	    p->opcode == breakpoint2_insn)
+		regs->pc = (unsigned long)p->addr;
+	else
+		regs->pc = (unsigned long)&p->ainsn.insn[0]; /* run the copy */
+}
+
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;	/* value returned on the no_kprobe path */
+	kprobe_opcode_t *addr;
+	struct kprobe_ctlblk *kcb;
+
+	addr = (kprobe_opcode_t *)regs->pc;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing.
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
+
+	/* Check we're not actually recursing. */
+	if (kprobe_running()) {
+		p = get_kprobe(addr);
+		if (p) {
+			if (kcb->kprobe_status == KPROBE_HIT_SS &&
+			    p->ainsn.insn[0] == breakpoint_insn) {
+				goto no_kprobe;
+			}
+			/*
+			 * We have reentered the kprobe_handler(), since
+			 * another probe was hit while within the handler.
+			 * We here save the original kprobes variables and
+			 * just single step on the instruction of the new probe
+			 * without calling any user handlers.
+			 */
+			save_previous_kprobe(kcb);
+			set_current_kprobe(p, regs, kcb);
+			kprobes_inc_nmissed_count(p);
+			prepare_singlestep(p, regs);
+			kcb->kprobe_status = KPROBE_REENTER;
+			return 1;	/* preemption stays disabled */
+		} else {
+			if (*addr != breakpoint_insn) {
+				/*
+				 * The breakpoint instruction was removed by
+				 * another cpu right after we hit, no further
+				 * handling of this interrupt is appropriate.
+				 */
+				ret = 1;
+				goto no_kprobe;
+			}
+			p = __this_cpu_read(current_kprobe);
+			if (p->break_handler && p->break_handler(p, regs))
+				goto ss_probe;
+		}
+		goto no_kprobe;
+	}
+
+	p = get_kprobe(addr);
+	if (!p) {
+		if (*addr != breakpoint_insn) {
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it. */
+		goto no_kprobe;
+	}
+
+	set_current_kprobe(p, regs, kcb);
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+	if (p->pre_handler && p->pre_handler(p, regs)) {
+		/* Handler has already set things up, so skip ss setup. */
+		return 1;
+	}
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kcb->kprobe_status = KPROBE_HIT_SS;
+	return 1;	/* preemption stays disabled */
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction that has been replaced by the breakpoint. To avoid the
+ * SMP problems that can occur when we temporarily put back the
+ * original opcode to single-step, we single-stepped a copy of the
+ * instruction.  The address of this copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * breakpoint trap by setting regs->pc to the saved pc plus 8.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+				       struct pt_regs *regs,
+				       struct kprobe_ctlblk *kcb)
+{
+	unsigned long orig_pc = kcb->kprobe_saved_pc;
+	regs->pc = orig_pc + 8;	/* 8: presumably one bundle; TODO confirm */
+}
+
+static inline int post_kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur)
+		return 0;	/* no probe in progress: not handled here */
+
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
+	}
+
+	resume_execution(cur, regs, kcb);
+
+	/* Restore back the original saved kprobes variables and continue. */
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
+		goto out;
+	}
+	reset_current_kprobe();
+out:
+	preempt_enable_no_resched();	/* pairs with kprobe_handler() */
+
+	return 1;
+}
+
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	struct kprobe *cur = kprobe_running();	/* caller checked non-NULL */
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+		return 1;
+
+	if (kcb->kprobe_status & KPROBE_HIT_SS) {
+		/*
+		 * We are here because the instruction being single
+		 * stepped caused a page fault. We reset the current
+		 * kprobe and let the page fault handler continue as a
+		 * normal page fault.  NOTE(review): resume_execution()
+		 * sets pc to the saved pc + 8, not the probe address.
+		 */
+		resume_execution(cur, regs, kcb);
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Notifier callback: dispatch break, single-step and page-fault events.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;	/* default: event not consumed here */
+
+	switch (val) {
+	case DIE_BREAK:
+		if (kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_SSTEPBP:
+		if (post_kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id(). */
+		preempt_disable();
+
+		if (kprobe_running()
+		    && kprobe_fault_handler(args->regs, args->trapnr))
+			ret = NOTIFY_STOP;
+		preempt_enable();
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	kcb->jprobe_saved_regs = *regs;	/* restored by longjmp_break_handler() */
+	kcb->jprobe_saved_sp = regs->sp;
+
+	memcpy(kcb->jprobes_stack, (void *)kcb->jprobe_saved_sp,
+	       MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp));
+
+	regs->pc = (unsigned long)(jp->entry);	/* resume in the jprobe entry */
+
+	return 1;	/* non-zero: kprobe_handler() skips single-step setup */
+}
+
+/* Defined in the inline asm below; marks the end of jprobe_return(). */
+void jprobe_return_end(void);
+
+void __kprobes jprobe_return(void)
+{
+	asm volatile(
+		"bpt\n\t"
+		".globl jprobe_return_end\n"
+		"jprobe_return_end:\n");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (regs->pc >= (unsigned long)jprobe_return &&
+	    regs->pc <= (unsigned long)jprobe_return_end) {
+		*regs = kcb->jprobe_saved_regs;	/* saved by setjmp_pre_handler() */
+		memcpy((void *)kcb->jprobe_saved_sp, kcb->jprobes_stack,
+		       MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp));
+		preempt_enable_no_resched();
+
+		return 1;
+	}
+	return 0;	/* break did not come from inside jprobe_return() */
+}
+
+/*
+ * Function return probe trampoline:
+ * - arch_init_kprobes() registers a probe (trampoline_p) at this address
+ * - When the probed function returns, this probe causes the
+ *   handlers to fire
+ */
+static void __used kretprobe_trampoline_holder(void)
+{
+	asm volatile(
+		"nop\n\t"
+		".global kretprobe_trampoline\n"
+		"kretprobe_trampoline:\n\t"
+		"nop\n\t"
+		: : : "memory");
+}
+
+void kretprobe_trampoline(void);
+
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+				      struct pt_regs *regs)
+{
+	ri->ret_addr = (kprobe_opcode_t *) regs->lr;	/* keep the real one */
+
+	/* Replace the return addr with trampoline addr */
+	regs->lr = (unsigned long)kretprobe_trampoline;
+}
+
+/*
+ * Called when the probe at kretprobe trampoline is hit.
+ */
+static int __kprobes trampoline_probe_handler(struct kprobe *p,
+						struct pt_regs *regs)
+{
+	struct kretprobe_instance *ri = NULL;
+	struct hlist_head *head, empty_rp;
+	struct hlist_node *tmp;
+	unsigned long flags, orig_ret_address = 0;
+	unsigned long trampoline_address = (unsigned long)kretprobe_trampoline;
+
+	INIT_HLIST_HEAD(&empty_rp);
+	kretprobe_hash_lock(current, &head, &flags);
+
+	/*
+	 * It is possible to have multiple instances associated with a given
+	 * task either because multiple functions in the call path have
+	 * a return probe installed on them, and/or more than one
+	 * return probe was registered for a target function.
+	 *
+	 * We can handle this because:
+	 *     - instances are always inserted at the head of the list
+	 *     - when multiple return probes are registered for the same
+	 *       function, the first instance's ret_addr will point to the
+	 *       real return address, and all the rest will point to
+	 *       kretprobe_trampoline
+	 */
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+		if (ri->task != current)
+			/* another task is sharing our hash bucket */
+			continue;
+
+		if (ri->rp && ri->rp->handler)
+			ri->rp->handler(ri, regs);
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri, &empty_rp);
+
+		if (orig_ret_address != trampoline_address) {
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+		}
+	}
+
+	kretprobe_assert(ri, orig_ret_address, trampoline_address);
+	instruction_pointer(regs) = orig_ret_address;
+
+	reset_current_kprobe();
+	kretprobe_hash_unlock(current, &flags);
+	preempt_enable_no_resched();
+
+	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);	/* free instances collected above */
+		kfree(ri);
+	}
+	/*
+	 * By returning a non-zero value, we are telling
+	 * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
+	 */
+	return 1;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+	if (p->addr == (kprobe_opcode_t *)kretprobe_trampoline)
+		return 1;	/* p sits on the kretprobe trampoline */
+
+	return 0;
+}
+
+static struct kprobe trampoline_p = {	/* registered by arch_init_kprobes() */
+	.addr = (kprobe_opcode_t *)kretprobe_trampoline,
+	.pre_handler = trampoline_probe_handler
+};
+
+int __init arch_init_kprobes(void)
+{
+	/* Register the kretprobe trampoline probe; report failure if any. */
+	return register_kprobe(&trampoline_p);
+}
diff --git a/arch/tile/kernel/mcount_64.S b/arch/tile/kernel/mcount_64.S
new file mode 100644
index 0000000..70d7bb0
--- /dev/null
+++ b/arch/tile/kernel/mcount_64.S
@@ -0,0 +1,224 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx specific __mcount support
+ */
+
+#include <linux/linkage.h>
+#include <asm/ftrace.h>
+
+#define REGSIZE 8
+
+	.text
+	.global __mcount
+
+	.macro	MCOUNT_SAVE_REGS	/* sp drops by 14 * REGSIZE in total */
+	addli	sp, sp, -REGSIZE
+	{
+	 st     sp, lr			/* lr goes in the topmost slot */
+	 addli	r29, sp, - (12 * REGSIZE)
+	}
+	{
+	 addli	sp, sp, - (13 * REGSIZE)
+	 st     r29, sp
+	}
+	addli	r29, r29, REGSIZE	/* r29 walks the r0..r10 slots */
+	{ st	r29, r0; addli	r29, r29, REGSIZE }
+	{ st	r29, r1; addli	r29, r29, REGSIZE }
+	{ st	r29, r2; addli	r29, r29, REGSIZE }
+	{ st	r29, r3; addli	r29, r29, REGSIZE }
+	{ st	r29, r4; addli	r29, r29, REGSIZE }
+	{ st	r29, r5; addli	r29, r29, REGSIZE }
+	{ st	r29, r6; addli	r29, r29, REGSIZE }
+	{ st	r29, r7; addli	r29, r29, REGSIZE }
+	{ st	r29, r8; addli	r29, r29, REGSIZE }
+	{ st	r29, r9; addli	r29, r29, REGSIZE }
+	{ st	r29, r10; addli	r29, r29, REGSIZE }
+	.endm
+
+	.macro	MCOUNT_RESTORE_REGS	/* reloads r0-r10 and lr, pops frame */
+	addli	r29, sp, (2 * REGSIZE)	/* slot of r0 */
+	{ ld	r0, r29; addli	r29, r29, REGSIZE }
+	{ ld	r1, r29; addli	r29, r29, REGSIZE }
+	{ ld	r2, r29; addli	r29, r29, REGSIZE }
+	{ ld	r3, r29; addli	r29, r29, REGSIZE }
+	{ ld	r4, r29; addli	r29, r29, REGSIZE }
+	{ ld	r5, r29; addli	r29, r29, REGSIZE }
+	{ ld	r6, r29; addli	r29, r29, REGSIZE }
+	{ ld	r7, r29; addli	r29, r29, REGSIZE }
+	{ ld	r8, r29; addli	r29, r29, REGSIZE }
+	{ ld	r9, r29; addli	r29, r29, REGSIZE }
+	{ ld	r10, r29; addli	lr, sp, (13 * REGSIZE) }
+	{ ld	lr, lr;  addli	sp, sp, (14 * REGSIZE) }
+	.endm
+
+	.macro  RETURN_BACK		/* lr = r10, then jump to the old lr */
+	{ move	r12, lr; move	lr, r10 }
+	jrp	r12
+	.endm
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+	.align	64
+STD_ENTRY(__mcount)
+__mcount:
+	j	ftrace_stub	/* ftrace_stub just executes RETURN_BACK */
+STD_ENDPROC(__mcount)
+
+	.align	64
+STD_ENTRY(ftrace_caller)
+	moveli	r11, hw2_last(function_trace_stop)
+	{ shl16insli	r11, r11, hw1(function_trace_stop); move r12, lr }
+	{ shl16insli	r11, r11, hw0(function_trace_stop); move lr, r10 }
+	ld	r11, r11
+	beqz	r11, 1f		/* function_trace_stop is zero: go trace */
+	jrp	r12		/* otherwise return without tracing */
+
+1:
+	{ move	r10, lr; move	lr, r12 }	/* undo the lr/r10 swap above */
+	MCOUNT_SAVE_REGS
+
+	/* arg1: self return address */
+	/* arg2: parent's return address */
+	{ move	r0, lr; move	r1, r10 }
+
+	.global	ftrace_call
+ftrace_call:
+	/*
+	 * a placeholder for the call to a real tracing function, i.e.
+	 * ftrace_trace_function()
+	 */
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.global	ftrace_graph_call
+ftrace_graph_call:
+	/*
+	 * a placeholder for the call to a real tracing function, i.e.
+	 * ftrace_graph_caller()
+	 */
+	nop
+#endif
+	MCOUNT_RESTORE_REGS
+	.global	ftrace_stub
+ftrace_stub:
+	RETURN_BACK
+STD_ENDPROC(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+	.align	64
+STD_ENTRY(__mcount)
+	moveli	r11, hw2_last(function_trace_stop)
+	{ shl16insli	r11, r11, hw1(function_trace_stop); move r12, lr }
+	{ shl16insli	r11, r11, hw0(function_trace_stop); move lr, r10 }
+	ld	r11, r11
+	beqz	r11, 1f		/* function_trace_stop is zero: go on */
+	jrp	r12		/* otherwise return without tracing */
+
+1:
+	{ move	r10, lr; move	lr, r12 }	/* undo the lr/r10 swap above */
+	{
+	 moveli	r11, hw2_last(ftrace_trace_function)
+	 moveli	r13, hw2_last(ftrace_stub)
+	}
+	{
+	 shl16insli	r11, r11, hw1(ftrace_trace_function)
+	 shl16insli	r13, r13, hw1(ftrace_stub)
+	}
+	{
+	 shl16insli	r11, r11, hw0(ftrace_trace_function)
+	 shl16insli	r13, r13, hw0(ftrace_stub)
+	}
+
+	ld	r11, r11
+	sub	r14, r13, r11
+	bnez	r14, static_trace	/* tracer differs from ftrace_stub */
+
+#ifdef	CONFIG_FUNCTION_GRAPH_TRACER
+	moveli	r15, hw2_last(ftrace_graph_return)
+	shl16insli	r15, r15, hw1(ftrace_graph_return)
+	shl16insli	r15, r15, hw0(ftrace_graph_return)
+	ld	r15, r15
+	sub	r15, r15, r13
+	bnez	r15, ftrace_graph_caller	/* return hook is not the stub */
+
+	{
+	 moveli	r16, hw2_last(ftrace_graph_entry)
+	 moveli	r17, hw2_last(ftrace_graph_entry_stub)
+	}
+	{
+	 shl16insli	r16, r16, hw1(ftrace_graph_entry)
+	 shl16insli	r17, r17, hw1(ftrace_graph_entry_stub)
+	}
+	{
+	 shl16insli	r16, r16, hw0(ftrace_graph_entry)
+	 shl16insli	r17, r17, hw0(ftrace_graph_entry_stub)
+	}
+	ld	r16, r16
+	sub	r17, r16, r17
+	bnez	r17, ftrace_graph_caller	/* entry hook is not the stub */
+
+#endif
+	RETURN_BACK
+
+static_trace:
+	MCOUNT_SAVE_REGS
+
+	/* arg1: self return address */
+	/* arg2: parent's return address */
+	{ move	r0, lr; move	r1, r10 }
+
+	/* call the function loaded from ftrace_trace_function into r11 */
+	jalr	r11
+
+	MCOUNT_RESTORE_REGS
+
+	.global ftrace_stub
+ftrace_stub:
+	RETURN_BACK
+STD_ENDPROC(__mcount)
+
+#endif	/* ! CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+STD_ENTRY(ftrace_graph_caller)
+ftrace_graph_caller:
+#ifndef CONFIG_DYNAMIC_FTRACE
+	MCOUNT_SAVE_REGS	/* dynamic case: presumably saved by ftrace_caller */
+#endif
+
+	/* arg1: Get the location of the parent's return address */
+	addi	r0, sp, 12 * REGSIZE
+	/* arg2: Get self return address */
+	move	r1, lr
+
+	jal prepare_ftrace_return
+
+	MCOUNT_RESTORE_REGS
+	RETURN_BACK
+STD_ENDPROC(ftrace_graph_caller)
+
+	.global return_to_handler
+return_to_handler:
+	MCOUNT_SAVE_REGS
+
+	jal	ftrace_return_to_handler
+	/* keep the returned parent address in r11 across the restore */
+	move	r11, r0
+
+	MCOUNT_RESTORE_REGS
+	jr	r11		/* continue at the address returned in r0 */
+
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index b9fe80e..09b5870 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -36,8 +36,9 @@ static void *tile_dma_alloc_coherent(struct device *dev, size_t size,
 				     dma_addr_t *dma_handle, gfp_t gfp,
 				     struct dma_attrs *attrs)
 {
-	u64 dma_mask = dev->coherent_dma_mask ?: DMA_BIT_MASK(32);
-	int node = dev_to_node(dev);
+	u64 dma_mask = (dev && dev->coherent_dma_mask) ?
+		dev->coherent_dma_mask : DMA_BIT_MASK(32);
+	int node = dev ? dev_to_node(dev) : 0;
 	int order = get_order(size);
 	struct page *pg;
 	dma_addr_t addr;
@@ -256,7 +257,7 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 	BUG_ON(!valid_dma_direction(direction));
 
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
-			    dma_address & PAGE_OFFSET, size, direction);
+			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
 
 static void tile_dma_sync_single_for_cpu(struct device *dev,
@@ -357,7 +358,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,
 
 	addr = page_to_phys(pg);
 
-	*dma_handle = phys_to_dma(dev, addr);
+	*dma_handle = addr + get_dma_offset(dev);
 
 	return page_address(pg);
 }
@@ -387,7 +388,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		sg->dma_address = sg_phys(sg);
 		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 
-		sg->dma_address = phys_to_dma(dev, sg->dma_address);
+		sg->dma_address = sg->dma_address + get_dma_offset(dev);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 		sg->dma_length = sg->length;
 #endif
@@ -422,7 +423,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(offset + size > PAGE_SIZE);
 	__dma_prep_page(page, offset, size, direction);
 
-	return phys_to_dma(dev, page_to_pa(page) + offset);
+	return page_to_pa(page) + offset + get_dma_offset(dev);
 }
 
 static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
@@ -432,10 +433,10 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
-	dma_address = dma_to_phys(dev, dma_address);
+	dma_address -= get_dma_offset(dev);
 
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
-			    dma_address & PAGE_OFFSET, size, direction);
+			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
 
 static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
@@ -445,7 +446,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
-	dma_handle = dma_to_phys(dev, dma_handle);
+	dma_handle -= get_dma_offset(dev);
 
 	__dma_complete_pa_range(dma_handle, size, direction);
 }
@@ -456,7 +457,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
 						enum dma_data_direction
 						direction)
 {
-	dma_handle = dma_to_phys(dev, dma_handle);
+	dma_handle -= get_dma_offset(dev);
 
 	__dma_prep_pa_range(dma_handle, size, direction);
 }
@@ -558,22 +559,47 @@ static struct dma_map_ops pci_swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 };
 
+static struct dma_map_ops pci_hybrid_dma_ops = {
+	.alloc = tile_swiotlb_alloc_coherent,	/* alloc/free: swiotlb helpers */
+	.free = tile_swiotlb_free_coherent,
+	.map_page = tile_pci_dma_map_page,	/* the rest: tile_pci_dma_* */
+	.unmap_page = tile_pci_dma_unmap_page,
+	.map_sg = tile_pci_dma_map_sg,
+	.unmap_sg = tile_pci_dma_unmap_sg,
+	.sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
+	.sync_single_for_device = tile_pci_dma_sync_single_for_device,
+	.sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
+	.sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
+	.mapping_error = tile_pci_dma_mapping_error,
+	.dma_supported = tile_pci_dma_supported
+};
+
 struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
 #else
 struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 #endif
 EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
+EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);
 
 #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
 int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	/* Handle legacy PCI devices with limited memory addressability. */
-	if (((dma_ops == gx_pci_dma_map_ops) ||
-	    (dma_ops == gx_legacy_pci_dma_map_ops)) &&
-	    (mask <= DMA_BIT_MASK(32))) {
-		if (mask > dev->archdata.max_direct_dma_addr)
+	/*
+	 * For PCI devices with 64-bit DMA addressing capability, promote
+	 * the dma_ops to full capability for both streams and consistent
+	 * memory access. For 32-bit capable devices, limit the consistent
+	 * memory DMA range to max_direct_dma_addr.
+	 */
+	if (dma_ops == gx_pci_dma_map_ops ||
+	    dma_ops == gx_hybrid_pci_dma_map_ops ||
+	    dma_ops == gx_legacy_pci_dma_map_ops) {
+		if (mask == DMA_BIT_MASK(64))
+			set_dma_ops(dev, gx_pci_dma_map_ops);
+		else if (mask > dev->archdata.max_direct_dma_addr)
 			mask = dev->archdata.max_direct_dma_addr;
 	}
 
@@ -584,3 +610,21 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 #endif
+
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+/*
+ * The generic dma_get_required_mask() uses the highest physical address
+ * (max_pfn) to provide the hint to the PCI drivers regarding 32-bit or
+ * 64-bit DMA configuration. Since TILEGx has an I/O TLB/MMU, DMAs can
+ * use the full 64-bit PCI address space and are not limited by the
+ * physical memory space, so we always let the PCI devices use
+ * 64-bit DMA if they have that capability, by returning the 64-bit
+ * DMA mask here. The device driver has the option to use 32-bit DMA if
+ * the device is not capable of 64-bit DMA.
+ */
+u64 dma_get_required_mask(struct device *dev)
+{
+	return DMA_BIT_MASK(64);
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+#endif
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index 67237d3..b7180e6 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -20,7 +20,6 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/bootmem.h>
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
@@ -52,6 +51,8 @@
  *
  */
 
+static int pci_probe = 1;	/* cleared by pcibios_setup() for "pci=off" */
+
 /*
  * This flag tells if the platform is TILEmpower that needs
  * special configuration for the PLX switch chip.
@@ -144,6 +145,11 @@ int __init tile_pci_init(void)
 {
 	int i;
 
+	if (!pci_probe) {
+		pr_info("PCI: disabled by boot argument\n");
+		return 0;
+	}
+
 	pr_info("PCI: Searching for controllers...\n");
 
 	/* Re-init number of PCIe controllers to support hot-plug feature. */
@@ -192,7 +198,6 @@ int __init tile_pci_init(void)
 			controller->hv_cfg_fd[0] = hv_cfg_fd0;
 			controller->hv_cfg_fd[1] = hv_cfg_fd1;
 			controller->hv_mem_fd = hv_mem_fd;
-			controller->first_busno = 0;
 			controller->last_busno = 0xff;
 			controller->ops = &tile_cfg_ops;
 
@@ -283,7 +288,7 @@ int __init pcibios_init(void)
 	 * known to require at least 20ms here, but we use a more
 	 * conservative value.
 	 */
-	mdelay(250);
+	msleep(250);
 
 	/* Scan all of the recorded PCI controllers.  */
 	for (i = 0; i < TILE_NUM_PCIE; i++) {
@@ -304,18 +309,10 @@ int __init pcibios_init(void)
 
 			pr_info("PCI: initializing controller #%d\n", i);
 
-			/*
-			 * This comes from the generic Linux PCI driver.
-			 *
-			 * It reads the PCI tree for this bus into the Linux
-			 * data structures.
-			 *
-			 * This is inlined in linux/pci.h and calls into
-			 * pci_scan_bus_parented() in probe.c.
-			 */
 			pci_add_resource(&resources, &ioport_resource);
 			pci_add_resource(&resources, &iomem_resource);
-			bus = pci_scan_root_bus(NULL, 0, controller->ops, controller, &resources);
+			bus = pci_scan_root_bus(NULL, 0, controller->ops,
+						controller, &resources);
 			controller->root_bus = bus;
 			controller->last_busno = bus->busn_res.end;
 		}
@@ -388,6 +385,16 @@ void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling. */
 }
 
+/* Process any "pci=" kernel boot arguments; only "off" is handled here. */
+char *__init pcibios_setup(char *str)
+{
+	if (!strcmp(str, "off")) {
+		pci_probe = 0;	/* tile_pci_init() then returns early */
+		return NULL;
+	}
+	return str;	/* anything else is handed back unchanged */
+}
+
 /*
  * Enable memory and/or address decoding, as appropriate, for the
  * device described by the 'dev' struct.
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 6640e7b..a97a645 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -69,19 +69,32 @@ static int pcie_rc[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
  * a HW PCIe link-training bug. The exact delay is specified with
  * a kernel boot argument in the form of "pcie_rc_delay=T,P,S",
  * where T is the TRIO instance number, P is the port number and S is
- * the delay in seconds. If the delay is not provided, the value
- * will be DEFAULT_RC_DELAY.
+ * the delay in seconds. If the argument is specified, but the delay is
+ * not provided, the value will be DEFAULT_RC_DELAY.
  */
 static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
 
 /* Default number of seconds that the PCIe RC port probe can be delayed. */
 #define DEFAULT_RC_DELAY	10
 
-/* Max number of seconds that the PCIe RC port probe can be delayed. */
-#define MAX_RC_DELAY		20
+/* The PCI I/O space size in each PCI domain. */
+#define IO_SPACE_SIZE		0x10000
+
+/* Provide shorter versions of some very long constant names. */
+#define AUTO_CONFIG_RC	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC
+#define AUTO_CONFIG_RC_G1	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1
+#define AUTO_CONFIG_EP	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT
+#define AUTO_CONFIG_EP_G1	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1
 
 /* Array of the PCIe ports configuration info obtained from the BIB. */
-struct pcie_port_property pcie_ports[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
+struct pcie_trio_ports_property pcie_ports[TILEGX_NUM_TRIO];
+
+/* Number of configured TRIO instances. */
+int num_trio_shims;
 
 /* All drivers share the TRIO contexts defined here. */
 gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
@@ -89,24 +102,21 @@ gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
 /* Pointer to an array of PCIe RC controllers. */
 struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
 int num_rc_controllers;
-static int num_ep_controllers;
 
 static struct pci_ops tile_cfg_ops;
 
 /* Mask of CPUs that should receive PCIe interrupts. */
 static struct cpumask intr_cpus_map;
 
-/*
- * We don't need to worry about the alignment of resources.
- */
+/* We don't need to worry about the alignment of resources. */
 resource_size_t pcibios_align_resource(void *data, const struct resource *res,
-				resource_size_t size, resource_size_t align)
+				       resource_size_t size,
+				       resource_size_t align)
 {
 	return res->start;
 }
 EXPORT_SYMBOL(pcibios_align_resource);
 
-
 /*
  * Pick a CPU to receive and handle the PCIe interrupts, based on the IRQ #.
  * For now, we simply send interrupts to non-dataplane CPUs.
@@ -134,24 +144,19 @@ static int tile_irq_cpu(int irq)
 	return cpu;
 }
 
-/*
- * Open a file descriptor to the TRIO shim.
- */
+/* Open a file descriptor to the TRIO shim. */
 static int tile_pcie_open(int trio_index)
 {
 	gxio_trio_context_t *context = &trio_contexts[trio_index];
 	int ret;
+	int mac;
 
-	/*
-	 * This opens a file descriptor to the TRIO shim.
-	 */
+	/* This opens a file descriptor to the TRIO shim. */
 	ret = gxio_trio_init(context, trio_index);
 	if (ret < 0)
-		return ret;
+		goto gxio_trio_init_failure;
 
-	/*
-	 * Allocate an ASID for the kernel.
-	 */
+	/* Allocate an ASID for the kernel. */
 	ret = gxio_trio_alloc_asids(context, 1, 0, 0);
 	if (ret < 0) {
 		pr_err("PCI: ASID alloc failure on TRIO %d, give up\n",
@@ -189,31 +194,97 @@ static int tile_pcie_open(int trio_index)
 	}
 #endif
 
+	/* Get the properties of the PCIe ports on this TRIO instance. */
+	ret = gxio_trio_get_port_property(context, &pcie_ports[trio_index]);
+	if (ret < 0) {
+		pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
+		       " on TRIO %d\n", ret, trio_index);
+		goto get_port_property_failure;
+	}
+
+	context->mmio_base_mac =
+		iorpc_ioremap(context->fd, 0, HV_TRIO_CONFIG_IOREMAP_SIZE);
+	if (context->mmio_base_mac == NULL) {
+		/* Set the error code first so the message reports it. */
+		ret = -ENOMEM;
+		pr_err("PCI: TRIO config space mapping failure, error %d,"
+		       " on TRIO %d\n", ret, trio_index);
+		goto trio_mmio_mapping_failure;
+	}
+
+	/* Check the port strap state which will override the BIB setting. */
+	for (mac = 0; mac < TILEGX_TRIO_PCIES; mac++) {
+		TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+		unsigned int reg_offset;
+
+		/* Ignore ports that are not specified in the BIB. */
+		if (!pcie_ports[trio_index].ports[mac].allow_rc &&
+		    !pcie_ports[trio_index].ports[mac].allow_ep)
+			continue;
+
+		reg_offset =
+			(TRIO_PCIE_INTFC_PORT_CONFIG <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+		port_config.word =
+			__gxio_mmio_read(context->mmio_base_mac + reg_offset);
+
+		if (port_config.strap_state != AUTO_CONFIG_RC &&
+		    port_config.strap_state != AUTO_CONFIG_RC_G1) {
+			/*
+			 * If this is really intended to be an EP port, record
+			 * it so that the endpoint driver will know about it.
+			 */
+			if (port_config.strap_state == AUTO_CONFIG_EP ||
+			    port_config.strap_state == AUTO_CONFIG_EP_G1)
+				pcie_ports[trio_index].ports[mac].allow_ep = 1;
+		}
+	}
+
 	return ret;
 
+trio_mmio_mapping_failure:
+get_port_property_failure:
 asid_alloc_failure:
 #ifdef USE_SHARED_PCIE_CONFIG_REGION
 pio_alloc_failure:
 #endif
 	hv_dev_close(context->fd);
+gxio_trio_init_failure:
+	context->fd = -1;
 
 	return ret;
 }
 
-static void
-tilegx_legacy_irq_ack(struct irq_data *d)
+static int __init tile_trio_init(void)
+{
+	int i;
+
+	/* We loop over all the TRIO shims. */
+	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
+		if (tile_pcie_open(i) < 0)
+			continue;
+		num_trio_shims++;
+	}
+
+	return 0;
+}
+postcore_initcall(tile_trio_init);
+
+static void tilegx_legacy_irq_ack(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_legacy_irq_mask(struct irq_data *d)
+static void tilegx_legacy_irq_mask(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_legacy_irq_unmask(struct irq_data *d)
+static void tilegx_legacy_irq_unmask(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
 }
@@ -234,8 +305,7 @@ static struct irq_chip tilegx_legacy_irq_chip = {
  * to Linux which just calls handle_level_irq() after clearing the
  * MAC INTx Assert status bit associated with this interrupt.
  */
-static void
-trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
+static void trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
 {
 	struct pci_controller *controller = irq_desc_get_handler_data(desc);
 	gxio_trio_context_t *trio_context = controller->trio;
@@ -301,9 +371,7 @@ static int tile_init_irqs(struct pci_controller *controller)
 			goto free_irqs;
 		}
 
-		/*
-		 * Register the IRQ handler with the kernel.
-		 */
+		/* Register the IRQ handler with the kernel. */
 		irq_set_chip_and_handler(irq, &tilegx_legacy_irq_chip,
 					trio_handle_level_irq);
 		irq_set_chip_data(irq, (void *)(uint64_t)i);
@@ -320,14 +388,39 @@ free_irqs:
 }
 
 /*
+ * Return 1 if the port is strapped to operate in RC mode.
+ */
+static int
+strapped_for_rc(gxio_trio_context_t *trio_context, int mac)
+{
+	TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+	unsigned int reg_offset;
+
+	/* Check the port configuration. */
+	reg_offset =
+		(TRIO_PCIE_INTFC_PORT_CONFIG <<
+			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+			TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+	port_config.word =
+		__gxio_mmio_read(trio_context->mmio_base_mac + reg_offset);
+
+	if (port_config.strap_state == AUTO_CONFIG_RC ||
+	    port_config.strap_state == AUTO_CONFIG_RC_G1)
+		return 1;
+	else
+		return 0;
+}
+
+/*
  * Find valid controllers and fill in pci_controller structs for each
  * of them.
  *
- * Returns the number of controllers discovered.
+ * Return the number of controllers discovered.
  */
 int __init tile_pci_init(void)
 {
-	int num_trio_shims = 0;
 	int ctl_index = 0;
 	int i, j;
 
@@ -338,64 +431,62 @@ int __init tile_pci_init(void)
 
 	pr_info("PCI: Searching for controllers...\n");
 
-	/*
-	 * We loop over all the TRIO shims.
-	 */
-	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
-		int ret;
-
-		ret = tile_pcie_open(i);
-		if (ret < 0)
-			continue;
-
-		num_trio_shims++;
-	}
-
 	if (num_trio_shims == 0 || sim_is_simulator())
 		return 0;
 
 	/*
-	 * Now determine which PCIe ports are configured to operate in RC mode.
-	 * We look at the Board Information Block first and then see if there
-	 * are any overriding configuration by the HW strapping pin.
+	 * Now determine which PCIe ports are configured to operate in RC
+	 * mode. There is a difference in the port configuration capability
+	 * between the Gx36 and Gx72 devices.
+	 *
+	 * The Gx36 has configuration capability for each of the 3 PCIe
+	 * interfaces (disable, auto endpoint, auto RC, etc.).
+	 * On the Gx72, you can only select one of the 3 PCIe interfaces per
+	 * TRIO to train automatically. Further, the allowable training modes
+	 * are reduced to four options (auto endpoint, auto RC, stream x1,
+	 * stream x4).
+	 *
+	 * For Gx36 ports, it must be allowed to be in RC mode by the
+	 * Board Information Block, and the hardware strapping pins must be
+	 * set to RC mode.
+	 *
+	 * For Gx72 ports, the port will operate in RC mode if either of the
+	 * following is true:
+	 * 1. It is allowed to be in RC mode by the Board Information Block,
+	 *    and the BIB doesn't allow the EP mode.
+	 * 2. It is allowed to be in either the RC or the EP mode by the BIB,
+	 *    and the hardware strapping pin is set to RC mode.
 	 */
 	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
 		gxio_trio_context_t *context = &trio_contexts[i];
-		int ret;
 
 		if (context->fd < 0)
 			continue;
 
-		ret = hv_dev_pread(context->fd, 0,
-			(HV_VirtAddr)&pcie_ports[i][0],
-			sizeof(struct pcie_port_property) * TILEGX_TRIO_PCIES,
-			GXIO_TRIO_OP_GET_PORT_PROPERTY);
-		if (ret < 0) {
-			pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
-				" on TRIO %d\n", ret, i);
-			continue;
-		}
-
 		for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
-			if (pcie_ports[i][j].allow_rc) {
+			int is_rc = 0;
+
+			if (pcie_ports[i].is_gx72 &&
+			    pcie_ports[i].ports[j].allow_rc) {
+				if (!pcie_ports[i].ports[j].allow_ep ||
+				    strapped_for_rc(context, j))
+					is_rc = 1;
+			} else if (pcie_ports[i].ports[j].allow_rc &&
+				   strapped_for_rc(context, j)) {
+				is_rc = 1;
+			}
+			if (is_rc) {
 				pcie_rc[i][j] = 1;
 				num_rc_controllers++;
 			}
-			else if (pcie_ports[i][j].allow_ep) {
-				num_ep_controllers++;
-			}
 		}
 	}
 
-	/*
-	 * Return if no PCIe ports are configured to operate in RC mode.
-	 */
+	/* Return if no PCIe ports are configured to operate in RC mode. */
 	if (num_rc_controllers == 0)
 		return 0;
 
-	/*
-	 * Set the TRIO pointer and MAC index for each PCIe RC port.
-	 */
+	/* Set the TRIO pointer and MAC index for each PCIe RC port. */
 	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
 		for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
 			if (pcie_rc[i][j]) {
@@ -411,26 +502,32 @@ int __init tile_pci_init(void)
 	}
 
 out:
-	/*
-	 * Configure each PCIe RC port.
-	 */
+	/* Configure each PCIe RC port. */
 	for (i = 0; i < num_rc_controllers; i++) {
-		/*
-		 * Configure the PCIe MAC to run in RC mode.
-		 */
 
+		/* Configure the PCIe MAC to run in RC mode. */
 		struct pci_controller *controller = &pci_controllers[i];
 
 		controller->index = i;
 		controller->ops = &tile_cfg_ops;
 
+		controller->io_space.start = PCIBIOS_MIN_IO +
+			(i * IO_SPACE_SIZE);
+		controller->io_space.end = controller->io_space.start +
+			IO_SPACE_SIZE - 1;
+		BUG_ON(controller->io_space.end > IO_SPACE_LIMIT);
+		controller->io_space.flags = IORESOURCE_IO;
+		snprintf(controller->io_space_name,
+			 sizeof(controller->io_space_name),
+			 "PCI I/O domain %d", i);
+		controller->io_space.name = controller->io_space_name;
+
 		/*
 		 * The PCI memory resource is located above the PA space.
 		 * For every host bridge, the BAR window or the MMIO aperture
 		 * is in range [3GB, 4GB - 1] of a 4GB space beyond the
 		 * PA space.
 		 */
-
 		controller->mem_offset = TILE_PCI_MEM_START +
 			(i * TILE_PCI_BAR_WINDOW_TOP);
 		controller->mem_space.start = controller->mem_offset +
@@ -458,7 +555,6 @@ static int tile_map_irq(const struct pci_dev *dev, u8 device, u8 pin)
 	return controller->irq_intx_table[pin - 1];
 }
 
-
 static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 {
 	gxio_trio_context_t *trio_context = controller->trio;
@@ -472,9 +568,7 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 
 	mac = controller->mac;
 
-	/*
-	 * Set our max read request size to be 4KB.
-	 */
+	/* Set our max read request size to be 4KB. */
 	reg_offset =
 		(TRIO_PCIE_RC_DEVICE_CONTROL <<
 			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -483,10 +577,10 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
 
 	dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
-						reg_offset);
+					      reg_offset);
 	dev_control.max_read_req_sz = 5;
 	__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
-						dev_control.word);
+			    dev_control.word);
 
 	/*
 	 * Set the max payload size supported by this Gx PCIe MAC.
@@ -502,10 +596,10 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
 
 	rc_dev_cap.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
-						reg_offset);
+					     reg_offset);
 	rc_dev_cap.mps_sup = 1;
 	__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
-						rc_dev_cap.word);
+			    rc_dev_cap.word);
 
 	/* Configure PCI Express MPS setting. */
 	list_for_each_entry(child, &root_bus->children, node)
@@ -528,7 +622,7 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 				    dev_control.max_payload_size,
 				    dev_control.max_read_req_sz,
 				    mac);
-        if (err < 0) {
+	if (err < 0) {
 		pr_err("PCI: PCIE_CONFIGURE_MAC_MPS_MRS failure, "
 			"MAC %d on TRIO %d\n",
 			mac, controller->trio_index);
@@ -565,21 +659,14 @@ static int setup_pcie_rc_delay(char *str)
 		if (!isdigit(*str))
 			return -EINVAL;
 		delay = simple_strtoul(str, (char **)&str, 10);
-		if (delay > MAX_RC_DELAY)
-			return -EINVAL;
 	}
 
 	rc_delay[trio_index][mac] = delay ? : DEFAULT_RC_DELAY;
-	pr_info("Delaying PCIe RC link training for %u sec"
-		" on MAC %lu on TRIO %lu\n", rc_delay[trio_index][mac],
-		mac, trio_index);
 	return 0;
 }
 early_param("pcie_rc_delay", setup_pcie_rc_delay);
 
-/*
- * PCI initialization entry point, called by subsys_initcall.
- */
+/* PCI initialization entry point, called by subsys_initcall. */
 int __init pcibios_init(void)
 {
 	resource_size_t offset;
@@ -589,35 +676,10 @@ int __init pcibios_init(void)
 
 	tile_pci_init();
 
-	if (num_rc_controllers == 0 && num_ep_controllers == 0)
+	if (num_rc_controllers == 0)
 		return 0;
 
 	/*
-	 * We loop over all the TRIO shims and set up the MMIO mappings.
-	 */
-	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
-		gxio_trio_context_t *context = &trio_contexts[i];
-
-		if (context->fd < 0)
-			continue;
-
-		/*
-		 * Map in the MMIO space for the MAC.
-		 */
-		offset = 0;
-		context->mmio_base_mac =
-			iorpc_ioremap(context->fd, offset,
-				      HV_TRIO_CONFIG_IOREMAP_SIZE);
-		if (context->mmio_base_mac == NULL) {
-			pr_err("PCI: MAC map failure on TRIO %d\n", i);
-
-			hv_dev_close(context->fd);
-			context->fd = -1;
-			continue;
-		}
-	}
-
-	/*
 	 * Delay a bit in case devices aren't ready.  Some devices are
 	 * known to require at least 20ms here, but we use a more
 	 * conservative value.
@@ -628,7 +690,6 @@ int __init pcibios_init(void)
 	for (next_busno = 0, i = 0; i < num_rc_controllers; i++) {
 		struct pci_controller *controller = &pci_controllers[i];
 		gxio_trio_context_t *trio_context = controller->trio;
-		TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
 		TRIO_PCIE_INTFC_PORT_STATUS_t port_status;
 		TRIO_PCIE_INTFC_TX_FIFO_CTL_t tx_fifo_ctl;
 		struct pci_bus *bus;
@@ -645,75 +706,64 @@ int __init pcibios_init(void)
 		mac = controller->mac;
 
 		/*
-		 * Check the port strap state which will override the BIB
-		 * setting.
+		 * Check for PCIe link-up status to decide if we need
+		 * to force the link to come up.
 		 */
-
 		reg_offset =
-			(TRIO_PCIE_INTFC_PORT_CONFIG <<
+			(TRIO_PCIE_INTFC_PORT_STATUS <<
 				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
 			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
-				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
 			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
 
-		port_config.word =
+		port_status.word =
 			__gxio_mmio_read(trio_context->mmio_base_mac +
 					 reg_offset);
-
-		if ((port_config.strap_state !=
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC) &&
-			(port_config.strap_state !=
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1)) {
-			/*
-			 * If this is really intended to be an EP port,
-			 * record it so that the endpoint driver will know about it.
-			 */
-			if (port_config.strap_state ==
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT ||
-			port_config.strap_state ==
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1)
-				pcie_ports[trio_index][mac].allow_ep = 1;
-
-			continue;
+		if (!port_status.dl_up) {
+			if (rc_delay[trio_index][mac]) {
+				pr_info("Delaying PCIe RC TRIO init %d sec"
+					" on MAC %d on TRIO %d\n",
+					rc_delay[trio_index][mac], mac,
+					trio_index);
+				msleep(rc_delay[trio_index][mac] * 1000);
+			}
+			ret = gxio_trio_force_rc_link_up(trio_context, mac);
+			if (ret < 0)
+				pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
+					"MAC %d on TRIO %d\n", mac, trio_index);
 		}
 
-		/*
-		 * Delay the RC link training if needed.
-		 */
-		if (rc_delay[trio_index][mac])
-			msleep(rc_delay[trio_index][mac] * 1000);
-
-		ret = gxio_trio_force_rc_link_up(trio_context, mac);
-		if (ret < 0)
-			pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
-				"MAC %d on TRIO %d\n", mac, trio_index);
-
 		pr_info("PCI: Found PCI controller #%d on TRIO %d MAC %d\n", i,
 			trio_index, controller->mac);
 
-		/*
-		 * Wait a bit here because some EP devices take longer
-		 * to come up.
-		 */
-		msleep(1000);
-
-		/*
-		 * Check for PCIe link-up status.
-		 */
-
-		reg_offset =
-			(TRIO_PCIE_INTFC_PORT_STATUS <<
-				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
-			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
-				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
-			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+		/* Delay the bus probe if needed. */
+		if (rc_delay[trio_index][mac]) {
+			pr_info("Delaying PCIe RC bus enumerating %d sec"
+				" on MAC %d on TRIO %d\n",
+				rc_delay[trio_index][mac], mac,
+				trio_index);
+			msleep(rc_delay[trio_index][mac] * 1000);
+		} else {
+			/*
+			 * Wait a bit here because some EP devices
+			 * take longer to come up.
+			 */
+			msleep(1000);
+		}
 
+		/* Check for PCIe link-up status again. */
 		port_status.word =
 			__gxio_mmio_read(trio_context->mmio_base_mac +
 					 reg_offset);
 		if (!port_status.dl_up) {
-			pr_err("PCI: link is down, MAC %d on TRIO %d\n",
-				mac, trio_index);
+			if (pcie_ports[trio_index].ports[mac].removable) {
+				pr_info("PCI: link is down, MAC %d on TRIO %d\n",
+					mac, trio_index);
+				pr_info("This is expected if no PCIe card"
+					" is connected to this link\n");
+			} else
+				pr_err("PCI: link is down, MAC %d on TRIO %d\n",
+					mac, trio_index);
 			continue;
 		}
 
@@ -739,7 +789,6 @@ int __init pcibios_init(void)
 		 * Change the device ID so that Linux bus crawl doesn't confuse
 		 * the internal bridge with any Tilera endpoints.
 		 */
-
 		reg_offset =
 			(TRIO_PCIE_RC_DEVICE_ID_VEN_ID <<
 				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -752,10 +801,7 @@ int __init pcibios_init(void)
 				    TRIO_PCIE_RC_DEVICE_ID_VEN_ID__DEV_ID_SHIFT) |
 				    TILERA_VENDOR_ID);
 
-		/*
-		 * Set the internal P2P bridge class code.
-		 */
-
+		/* Set the internal P2P bridge class code. */
 		reg_offset =
 			(TRIO_PCIE_RC_REVISION_ID <<
 				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -766,26 +812,22 @@ int __init pcibios_init(void)
 		class_code_revision =
 			__gxio_mmio_read32(trio_context->mmio_base_mac +
 					   reg_offset);
-		class_code_revision = (class_code_revision & 0xff ) |
-					(PCI_CLASS_BRIDGE_PCI << 16);
+		class_code_revision = (class_code_revision & 0xff) |
+			(PCI_CLASS_BRIDGE_PCI << 16);
 
 		__gxio_mmio_write32(trio_context->mmio_base_mac +
 				    reg_offset, class_code_revision);
 
 #ifdef USE_SHARED_PCIE_CONFIG_REGION
 
-		/*
-		 * Map in the MMIO space for the PIO region.
-		 */
+		/* Map in the MMIO space for the PIO region. */
 		offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index) |
 			(((unsigned long long)mac) <<
 			TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT);
 
 #else
 
-		/*
-		 * Alloc a PIO region for PCI config access per MAC.
-		 */
+		/* Alloc a PIO region for PCI config access per MAC. */
 		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
 		if (ret < 0) {
 			pr_err("PCI: PCI CFG PIO alloc failure for mac %d "
@@ -796,9 +838,7 @@ int __init pcibios_init(void)
 
 		trio_context->pio_cfg_index[mac] = ret;
 
-		/*
-		 * For PIO CFG, the bus_address_hi parameter is 0.
-		 */
+		/* For PIO CFG, the bus_address_hi parameter is 0. */
 		ret = gxio_trio_init_pio_region_aux(trio_context,
 			trio_context->pio_cfg_index[mac],
 			mac, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE);
@@ -815,9 +855,15 @@ int __init pcibios_init(void)
 
 #endif
 
+		/*
+		 * To save VMALLOC space, we take advantage of the fact that
+		 * bit 29 in the PIO CFG address format is reserved 0. With
+		 * TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT being 30,
+		 * this cuts VMALLOC space usage from 1GB to 512MB per mac.
+		 */
 		trio_context->mmio_base_pio_cfg[mac] =
-			iorpc_ioremap(trio_context->fd, offset,
-			(1 << TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT));
+			iorpc_ioremap(trio_context->fd, offset, (1UL <<
+			(TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT - 1)));
 		if (trio_context->mmio_base_pio_cfg[mac] == NULL) {
 			pr_err("PCI: PIO map failure for mac %d on TRIO %d\n",
 				mac, trio_index);
@@ -825,9 +871,7 @@ int __init pcibios_init(void)
 			continue;
 		}
 
-		/*
-		 * Initialize the PCIe interrupts.
-		 */
+		/* Initialize the PCIe interrupts. */
 		if (tile_init_irqs(controller)) {
 			pr_err("PCI: IRQs init failure for mac %d on TRIO %d\n",
 				mac, trio_index);
@@ -838,17 +882,16 @@ int __init pcibios_init(void)
 		/*
 		 * The PCI memory resource is located above the PA space.
 		 * The memory range for the PCI root bus should not overlap
-		 * with the physical RAM
+		 * with the physical RAM.
 		 */
 		pci_add_resource_offset(&resources, &controller->mem_space,
 					controller->mem_offset);
-
+		pci_add_resource(&resources, &controller->io_space);
 		controller->first_busno = next_busno;
 		bus = pci_scan_root_bus(NULL, next_busno, controller->ops,
 					controller, &resources);
 		controller->root_bus = bus;
 		next_busno = bus->busn_res.end + 1;
-
 	}
 
 	/* Do machine dependent PCI interrupt routing */
@@ -860,7 +903,6 @@ int __init pcibios_init(void)
 	 * It allocates all of the resources (I/O memory, etc)
 	 * associated with the devices read in above.
 	 */
-
 	pci_assign_unassigned_resources();
 
 	/* Record the I/O resources in the PCI controller structure. */
@@ -868,9 +910,6 @@ int __init pcibios_init(void)
 		struct pci_controller *controller = &pci_controllers[i];
 		gxio_trio_context_t *trio_context = controller->trio;
 		struct pci_bus *root_bus = pci_controllers[i].root_bus;
-		struct pci_bus *next_bus;
-		uint32_t bus_address_hi;
-		struct pci_dev *dev;
 		int ret;
 		int j;
 
@@ -884,43 +923,12 @@ int __init pcibios_init(void)
 		/* Configure the max_payload_size values for this domain. */
 		fixup_read_and_payload_sizes(controller);
 
-		list_for_each_entry(dev, &root_bus->devices, bus_list) {
-			/* Find the PCI host controller, ie. the 1st bridge. */
-			if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
-				(PCI_SLOT(dev->devfn) == 0)) {
-				next_bus = dev->subordinate;
-				pci_controllers[i].mem_resources[0] =
-					*next_bus->resource[0];
-				pci_controllers[i].mem_resources[1] =
-					 *next_bus->resource[1];
-				pci_controllers[i].mem_resources[2] =
-					 *next_bus->resource[2];
-
-				break;
-			}
-		}
-
-		if (pci_controllers[i].mem_resources[1].flags & IORESOURCE_MEM)
-			bus_address_hi =
-				pci_controllers[i].mem_resources[1].start >> 32;
-		else if (pci_controllers[i].mem_resources[2].flags & IORESOURCE_PREFETCH)
-			bus_address_hi =
-				pci_controllers[i].mem_resources[2].start >> 32;
-		else {
-			/* This is unlikely. */
-			pr_err("PCI: no memory resources on TRIO %d mac %d\n",
-				controller->trio_index, controller->mac);
-			continue;
-		}
-
-		/*
-		 * Alloc a PIO region for PCI memory access for each RC port.
-		 */
+		/* Alloc a PIO region for PCI memory access for each RC port. */
 		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
 		if (ret < 0) {
 			pr_err("PCI: MEM PIO alloc failure on TRIO %d mac %d, "
-				"give up\n", controller->trio_index,
-				controller->mac);
+			       "give up\n", controller->trio_index,
+			       controller->mac);
 
 			continue;
 		}
@@ -938,12 +946,45 @@ int __init pcibios_init(void)
 						    0);
 		if (ret < 0) {
 			pr_err("PCI: MEM PIO init failure on TRIO %d mac %d, "
-				"give up\n", controller->trio_index,
-				controller->mac);
+			       "give up\n", controller->trio_index,
+			       controller->mac);
 
 			continue;
 		}
 
+#ifdef CONFIG_TILE_PCI_IO
+		/*
+		 * Alloc a PIO region for PCI I/O space access for each RC port.
+		 */
+		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
+		if (ret < 0) {
+			pr_err("PCI: I/O PIO alloc failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+
+		controller->pio_io_index = ret;
+
+		/*
+		 * For PIO IO, the bus_address_hi parameter is hard-coded 0
+		 * because PCI I/O address space is 32-bit.
+		 */
+		ret = gxio_trio_init_pio_region_aux(trio_context,
+						    controller->pio_io_index,
+						    controller->mac,
+						    0,
+						    HV_TRIO_PIO_FLAG_IO_SPACE);
+		if (ret < 0) {
+			pr_err("PCI: I/O PIO init failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+#endif
+
 		/*
 		 * Configure a Mem-Map region for each memory controller so
 		 * that Linux can map all of its PA space to the PCI bus.
@@ -958,9 +999,9 @@ int __init pcibios_init(void)
 							  0);
 			if (ret < 0) {
 				pr_err("PCI: Mem-Map alloc failure on TRIO %d "
-					"mac %d for MC %d, give up\n",
-					controller->trio_index,
-					controller->mac, j);
+				       "mac %d for MC %d, give up\n",
+				       controller->trio_index,
+				       controller->mac, j);
 
 				goto alloc_mem_map_failed;
 			}
@@ -991,9 +1032,9 @@ int __init pcibios_init(void)
 				GXIO_TRIO_ORDER_MODE_UNORDERED);
 			if (ret < 0) {
 				pr_err("PCI: Mem-Map init failure on TRIO %d "
-					"mac %d for MC %d, give up\n",
-					controller->trio_index,
-					controller->mac, j);
+				       "mac %d for MC %d, give up\n",
+				       controller->trio_index,
+				       controller->mac, j);
 
 				goto alloc_mem_map_failed;
 			}
@@ -1002,23 +1043,19 @@ int __init pcibios_init(void)
 alloc_mem_map_failed:
 			break;
 		}
-
 	}
 
 	return 0;
 }
 subsys_initcall(pcibios_init);
 
-/* Note: to be deleted after Linux 3.6 merge. */
+/* No bus fixups needed. */
 void pcibios_fixup_bus(struct pci_bus *bus)
 {
 }
 
-/*
- * This can be called from the generic PCI layer, but doesn't need to
- * do anything.
- */
-char *pcibios_setup(char *str)
+/* Process any "pci=" kernel boot arguments. */
+char *__init pcibios_setup(char *str)
 {
 	if (!strcmp(str, "off")) {
 		pci_probe = 0;
@@ -1029,8 +1066,7 @@ char *pcibios_setup(char *str)
 
 /*
  * Enable memory address decoding, as appropriate, for the
- * device described by the 'dev' struct. The I/O decoding
- * is disabled, though the TILE-Gx supports I/O addressing.
+ * device described by the 'dev' struct.
  *
  * This is called from the generic PCI layer, and can be called
  * for bridges or endpoints.
@@ -1040,13 +1076,24 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 	return pci_enable_resources(dev, mask);
 }
 
-/* Called for each device after PCI setup is done. */
+/*
+ * Called for each device after PCI setup is done.
+ * We initialize the PCI device capabilities conservatively, assuming that
+ * all devices can only address the 32-bit DMA space. The exception here is
+ * that the device dma_offset is set to the value that matches the 64-bit
+ * capable devices. This is OK because dma_offset is not used by legacy
+ * dma_ops, nor by the hybrid dma_ops's streaming DMAs, which are 64-bit ops.
+ * This implementation matches the kernel design of setting PCI devices'
+ * coherent_dma_mask to 0xffffffffull by default, allowing the device drivers
+ * to skip calling pci_set_consistent_dma_mask(DMA_BIT_MASK(32)).
+ */
 static void pcibios_fixup_final(struct pci_dev *pdev)
 {
-	set_dma_ops(&pdev->dev, gx_pci_dma_map_ops);
+	set_dma_ops(&pdev->dev, gx_legacy_pci_dma_map_ops);
 	set_dma_offset(&pdev->dev, TILE_PCI_MEM_MAP_BASE_OFFSET);
 	pdev->dev.archdata.max_direct_dma_addr =
 		TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
+	pdev->dev.coherent_dma_mask = TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_final);
 
@@ -1060,19 +1107,15 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
 	resource_size_t start;
 	resource_size_t end;
 	int trio_fd;
-	int i, j;
+	int i;
 
 	start = phys_addr;
 	end = phys_addr + size - 1;
 
 	/*
-	 * In the following, each PCI controller's mem_resources[1]
-	 * represents its (non-prefetchable) PCI memory resource and
-	 * mem_resources[2] refers to its prefetchable PCI memory resource.
-	 * By searching phys_addr in each controller's mem_resources[], we can
+	 * By searching phys_addr in each controller's mem_space, we can
 	 * determine the controller that should accept the PCI memory access.
 	 */
-
 	for (i = 0; i < num_rc_controllers; i++) {
 		/*
 		 * Skip controllers that are not properly initialized or
@@ -1081,25 +1124,18 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
 		if (pci_controllers[i].root_bus == NULL)
 			continue;
 
-		for (j = 1; j < 3; j++) {
-			bar_start =
-				pci_controllers[i].mem_resources[j].start;
-			bar_end =
-				pci_controllers[i].mem_resources[j].end;
-
-			if ((start >= bar_start) && (end <= bar_end)) {
+		bar_start = pci_controllers[i].mem_space.start;
+		bar_end = pci_controllers[i].mem_space.end;
 
-				controller = &pci_controllers[i];
-
-				goto got_it;
-			}
+		if ((start >= bar_start) && (end <= bar_end)) {
+			controller = &pci_controllers[i];
+			break;
 		}
 	}
 
 	if (controller == NULL)
 		return NULL;
 
-got_it:
 	trio_fd = controller->trio->fd;
 
 	/* Convert the resource start to the bus address offset. */
@@ -1107,14 +1143,71 @@ got_it:
 
 	offset = HV_TRIO_PIO_OFFSET(controller->pio_mem_index) + start;
 
-	/*
-	 * We need to keep the PCI bus address's in-page offset in the VA.
-	 */
+	/* We need to keep the PCI bus address's in-page offset in the VA. */
 	return iorpc_ioremap(trio_fd, offset, size) +
-		(phys_addr & (PAGE_SIZE - 1));
+		(start & (PAGE_SIZE - 1));
 }
 EXPORT_SYMBOL(ioremap);
 
+#ifdef CONFIG_TILE_PCI_IO
+/* Map a PCI I/O port range into VA space; returns NULL on failure. */
+void __iomem *ioport_map(unsigned long port, unsigned int size)
+{
+	struct pci_controller *controller = NULL;
+	resource_size_t bar_start, bar_end;
+	resource_size_t start, end;
+	resource_size_t offset;
+	void __iomem *va;
+	int trio_fd;
+	int i;
+
+	start = port;
+	end = port + size - 1;
+
+	/*
+	 * By searching the port in each controller's io_space, we can
+	 * determine the controller that should accept the PCI I/O access.
+	 */
+	for (i = 0; i < num_rc_controllers; i++) {
+		/*
+		 * Skip controllers that are not properly initialized or
+		 * have down links.
+		 */
+		if (pci_controllers[i].root_bus == NULL)
+			continue;
+
+		bar_start = pci_controllers[i].io_space.start;
+		bar_end = pci_controllers[i].io_space.end;
+
+		if ((start >= bar_start) && (end <= bar_end)) {
+			controller = &pci_controllers[i];
+			break;
+		}
+	}
+
+	if (controller == NULL)
+		return NULL;
+
+	trio_fd = controller->trio->fd;
+
+	/* Convert the resource start to the bus address offset. */
+	port -= controller->io_space.start;
+	offset = HV_TRIO_PIO_OFFSET(controller->pio_io_index) + port;
+
+	/* Map it, but never turn a failed (NULL) mapping into a bogus VA. */
+	va = iorpc_ioremap(trio_fd, offset, size);
+	/* We need to keep the PCI bus address's in-page offset in the VA. */
+	return va ? va + (port & (PAGE_SIZE - 1)) : NULL;
+}
+EXPORT_SYMBOL(ioport_map);
+
+void ioport_unmap(void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(ioport_unmap);
+#endif
+
 void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
 {
 	iounmap(addr);
@@ -1136,7 +1229,6 @@ EXPORT_SYMBOL(pci_iounmap);
  * offset is in bytes, from the start of config space for the
  * specified bus & device.
  */
-
 static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
 			 int size, u32 *val)
 {
@@ -1186,7 +1278,6 @@ static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Accesses to the directly attached device have to be
 	 * sent as type-0 configs.
 	 */
-
 	if (busnum == (controller->first_busno + 1)) {
 		/*
 		 * There is only one device off of our built-in P2P bridge.
@@ -1208,9 +1299,8 @@ static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Note that we don't set the mac field in cfg_addr because the
 	 * mapping is per port.
 	 */
-
 	mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
-			cfg_addr.word;
+		cfg_addr.word;
 
 valid_device:
 
@@ -1314,7 +1404,6 @@ static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Accesses to the directly attached device have to be
 	 * sent as type-0 configs.
 	 */
-
 	if (busnum == (controller->first_busno + 1)) {
 		/*
 		 * There is only one device off of our built-in P2P bridge.
@@ -1336,7 +1425,6 @@ static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Note that we don't set the mac field in cfg_addr because the
 	 * mapping is per port.
 	 */
-
 	mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
 			cfg_addr.word;
 
@@ -1374,11 +1462,8 @@ static struct pci_ops tile_cfg_ops = {
 };
 
 
-/*
- * MSI support starts here.
- */
-static unsigned int
-tilegx_msi_startup(struct irq_data *d)
+/* MSI support starts here. */
+static unsigned int tilegx_msi_startup(struct irq_data *d)
 {
 	if (d->msi_desc)
 		unmask_msi_irq(d);
@@ -1386,21 +1471,18 @@ tilegx_msi_startup(struct irq_data *d)
 	return 0;
 }
 
-static void
-tilegx_msi_ack(struct irq_data *d)
+static void tilegx_msi_ack(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_msi_mask(struct irq_data *d)
+static void tilegx_msi_mask(struct irq_data *d)
 {
 	mask_msi_irq(d);
 	__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_msi_unmask(struct irq_data *d)
+static void tilegx_msi_unmask(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
 	unmask_msi_irq(d);
@@ -1457,32 +1539,55 @@ int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
 	trio_context = controller->trio;
 
 	/*
-	 * Allocate the Mem-Map that will accept the MSI write and
-	 * trigger the TILE-side interrupts.
+	 * Allocate a scatter-queue that will accept the MSI write and
+	 * trigger the TILE-side interrupts. We use the scatter-queue regions
+	 * before the mem map regions, because the latter are needed by more
+	 * applications.
 	 */
-	mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
-	if (mem_map < 0) {
-		dev_printk(KERN_INFO, &pdev->dev,
-			"%s Mem-Map alloc failure. "
-			"Failed to initialize MSI interrupts. "
-			"Falling back to legacy interrupts.\n",
-			desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+	mem_map = gxio_trio_alloc_scatter_queues(trio_context, 1, 0, 0);
+	if (mem_map >= 0) {
+		TRIO_MAP_SQ_DOORBELL_FMT_t doorbell_template = {{
+			.pop = 0,
+			.doorbell = 1,
+		}};
+
+		mem_map += TRIO_NUM_MAP_MEM_REGIONS;
+		mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+			mem_map * MEM_MAP_INTR_REGION_SIZE;
+		mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+		msi_addr = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 8;
+		msg.data = (unsigned int)doorbell_template.word;
+	} else {
+		/* Scatter-queue alloc failed; fall back to mem-map regions. */
+		mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
+		if (mem_map < 0) {
+			dev_printk(KERN_INFO, &pdev->dev,
+				"%s Mem-Map alloc failure. "
+				"Failed to initialize MSI interrupts. "
+				"Falling back to legacy interrupts.\n",
+				desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+			ret = -ENOMEM;
+			goto msi_mem_map_alloc_failure;
+		}
 
-		ret = -ENOMEM;
-		goto msi_mem_map_alloc_failure;
+		mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+			mem_map * MEM_MAP_INTR_REGION_SIZE;
+		mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+		msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 -
+			TRIO_MAP_MEM_REG_INT0;
+
+		msg.data = mem_map;
 	}
 
 	/* We try to distribute different IRQs to different tiles. */
 	cpu = tile_irq_cpu(irq);
 
 	/*
-	 * Now call up to the HV to configure the Mem-Map interrupt and
+	 * Now call up to the HV to configure the MSI interrupt and
 	 * set up the IPI binding.
 	 */
-	mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
-		mem_map * MEM_MAP_INTR_REGION_SIZE;
-	mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
-
 	ret = gxio_trio_config_msi_intr(trio_context, cpu_x(cpu), cpu_y(cpu),
 					KERNEL_PL, irq, controller->mac,
 					mem_map, mem_map_base, mem_map_limit,
@@ -1495,13 +1600,9 @@ int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
 
 	irq_set_msi_desc(irq, desc);
 
-	msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 - TRIO_MAP_MEM_REG_INT0;
-
 	msg.address_hi = msi_addr >> 32;
 	msg.address_lo = msi_addr & 0xffffffff;
 
-	msg.data = mem_map;
-
 	write_msi_msg(irq, &msg);
 	irq_set_chip_and_handler(irq, &tilegx_msi_chip, handle_level_irq);
 	irq_set_handler_data(irq, controller);
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index dafc447..681100c 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -113,7 +113,6 @@ arch_initcall(proc_tile_init);
  * Support /proc/sys/tile directory
  */
 
-#ifndef __tilegx__  /* FIXME: GX: no support for unaligned access yet */
 static ctl_table unaligned_subtable[] = {
 	{
 		.procname	= "enabled",
@@ -160,4 +159,3 @@ static int __init proc_sys_tile_init(void)
 }
 
 arch_initcall(proc_sys_tile_init);
-#endif
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 8ac3044..16ed589 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -33,6 +33,7 @@
 #include <asm/syscalls.h>
 #include <asm/traps.h>
 #include <asm/setup.h>
+#include <asm/uaccess.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -74,19 +75,6 @@ void arch_release_thread_info(struct thread_info *info)
 {
 	struct single_step_state *step_state = info->step_state;
 
-#ifdef CONFIG_HARDWALL
-	/*
-	 * We free a thread_info from the context of the task that has
-	 * been scheduled next, so the original task is already dead.
-	 * Calling deactivate here just frees up the data structures.
-	 * If the task we're freeing held the last reference to a
-	 * hardwall fd, it would have been released prior to this point
-	 * anyway via exit_files(), and the hardwall_task.info pointers
-	 * would be NULL by now.
-	 */
-	hardwall_deactivate_all(info->task);
-#endif
-
 	if (step_state) {
 
 		/*
@@ -160,6 +148,14 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	 */
 	task_thread_info(p)->step_state = NULL;
 
+#ifdef __tilegx__
+	/*
+	 * Do not clone unalign jit fixup from the parent; each thread
+	 * must allocate its own on demand.
+	 */
+	task_thread_info(p)->unalign_jit_base = NULL;
+#endif
+
 	/*
 	 * Copy the registers onto the kernel stack so the
 	 * return-from-interrupt code will reload it into registers.
@@ -191,16 +187,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	memset(&p->thread.dma_async_tlb, 0, sizeof(struct async_tlb));
 #endif
 
-#if CHIP_HAS_SN_PROC()
-	/* Likewise, the new thread is not running static processor code. */
-	p->thread.sn_proc_running = 0;
-	memset(&p->thread.sn_async_tlb, 0, sizeof(struct async_tlb));
-#endif
-
-#if CHIP_HAS_PROC_STATUS_SPR()
 	/* New thread has its miscellaneous processor state bits clear. */
 	p->thread.proc_status = 0;
-#endif
 
 #ifdef CONFIG_HARDWALL
 	/* New thread does not own any networks. */
@@ -218,19 +206,32 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	return 0;
 }
 
+int set_unalign_ctl(struct task_struct *tsk, unsigned int val)
+{
+	task_thread_info(tsk)->align_ctl = val;
+	return 0;	/* val is stored as-is; no failure path */
+}
+
+int get_unalign_ctl(struct task_struct *tsk, unsigned long adr)
+{
+	unsigned int __user *uptr = (unsigned int __user *)adr;
+	return put_user(task_thread_info(tsk)->align_ctl, uptr);
+}
+
+static struct task_struct corrupt_current = { .comm = "<corrupt>" };
+
 /*
  * Return "current" if it looks plausible, or else a pointer to a dummy.
  * This can be helpful if we are just trying to emit a clean panic.
  */
 struct task_struct *validate_current(void)
 {
-	static struct task_struct corrupt = { .comm = "<corrupt>" };
 	struct task_struct *tsk = current;
 	if (unlikely((unsigned long)tsk < PAGE_OFFSET ||
 		     (high_memory && (void *)tsk > high_memory) ||
 		     ((unsigned long)tsk & (__alignof__(*tsk) - 1)) != 0)) {
 		pr_err("Corrupt 'current' %p (sp %#lx)\n", tsk, stack_pointer);
-		tsk = &corrupt;
+		tsk = &corrupt_current;
 	}
 	return tsk;
 }
@@ -369,15 +370,11 @@ static void save_arch_state(struct thread_struct *t)
 	t->system_save[2] = __insn_mfspr(SPR_SYSTEM_SAVE_0_2);
 	t->system_save[3] = __insn_mfspr(SPR_SYSTEM_SAVE_0_3);
 	t->intctrl_0 = __insn_mfspr(SPR_INTCTRL_0_STATUS);
-#if CHIP_HAS_PROC_STATUS_SPR()
 	t->proc_status = __insn_mfspr(SPR_PROC_STATUS);
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	t->interrupt_vector_base = __insn_mfspr(SPR_INTERRUPT_VECTOR_BASE_0);
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	t->tile_rtf_hwm = __insn_mfspr(SPR_TILE_RTF_HWM);
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	t->dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
 #endif
@@ -398,15 +395,11 @@ static void restore_arch_state(const struct thread_struct *t)
 	__insn_mtspr(SPR_SYSTEM_SAVE_0_2, t->system_save[2]);
 	__insn_mtspr(SPR_SYSTEM_SAVE_0_3, t->system_save[3]);
 	__insn_mtspr(SPR_INTCTRL_0_STATUS, t->intctrl_0);
-#if CHIP_HAS_PROC_STATUS_SPR()
 	__insn_mtspr(SPR_PROC_STATUS, t->proc_status);
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	__insn_mtspr(SPR_INTERRUPT_VECTOR_BASE_0, t->interrupt_vector_base);
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	__insn_mtspr(SPR_TILE_RTF_HWM, t->tile_rtf_hwm);
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	__insn_mtspr(SPR_DSTREAM_PF, t->dstream_pf);
 #endif
@@ -415,26 +408,11 @@ static void restore_arch_state(const struct thread_struct *t)
 
 void _prepare_arch_switch(struct task_struct *next)
 {
-#if CHIP_HAS_SN_PROC()
-	int snctl;
-#endif
 #if CHIP_HAS_TILE_DMA()
 	struct tile_dma_state *dma = &current->thread.tile_dma_state;
 	if (dma->enabled)
 		save_tile_dma_state(dma);
 #endif
-#if CHIP_HAS_SN_PROC()
-	/*
-	 * Suspend the static network processor if it was running.
-	 * We do not suspend the fabric itself, just like we don't
-	 * try to suspend the UDN.
-	 */
-	snctl = __insn_mfspr(SPR_SNCTL);
-	current->thread.sn_proc_running =
-		(snctl & SPR_SNCTL__FRZPROC_MASK) == 0;
-	if (current->thread.sn_proc_running)
-		__insn_mtspr(SPR_SNCTL, snctl | SPR_SNCTL__FRZPROC_MASK);
-#endif
 }
 
 
@@ -462,17 +440,6 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
 	/* Restore other arch state. */
 	restore_arch_state(&next->thread);
 
-#if CHIP_HAS_SN_PROC()
-	/*
-	 * Restart static network processor in the new process
-	 * if it was running before.
-	 */
-	if (next->thread.sn_proc_running) {
-		int snctl = __insn_mfspr(SPR_SNCTL);
-		__insn_mtspr(SPR_SNCTL, snctl & ~SPR_SNCTL__FRZPROC_MASK);
-	}
-#endif
-
 #ifdef CONFIG_HARDWALL
 	/* Enable or disable access to the network registers appropriately. */
 	hardwall_switch_tasks(prev, next);
@@ -514,7 +481,7 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 		schedule();
 		return 1;
 	}
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 	if (thread_info_flags & _TIF_ASYNC_TLB) {
 		do_async_page_fault(regs);
 		return 1;
@@ -564,7 +531,15 @@ void flush_thread(void)
  */
 void exit_thread(void)
 {
-	/* Nothing */
+#ifdef CONFIG_HARDWALL
+	/*
+	 * Remove the task from the list of tasks that are associated
+	 * with any live hardwalls.  (If the task that is exiting held
+	 * the last reference to a hardwall fd, it would already have
+	 * been released and deactivated at this point.)
+	 */
+	hardwall_deactivate_all(current);
+#endif
 }
 
 void show_regs(struct pt_regs *regs)
@@ -573,23 +548,24 @@ void show_regs(struct pt_regs *regs)
 	int i;
 
 	pr_err("\n");
-	show_regs_print_info(KERN_ERR);
+	if (tsk != &corrupt_current)
+		show_regs_print_info(KERN_ERR);
 #ifdef __tilegx__
-	for (i = 0; i < 51; i += 3)
+	for (i = 0; i < 17; i++)
 		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
-		       i, regs->regs[i], i+1, regs->regs[i+1],
-		       i+2, regs->regs[i+2]);
-	pr_err(" r51: "REGFMT" r52: "REGFMT" tp : "REGFMT"\n",
-	       regs->regs[51], regs->regs[52], regs->tp);
+		       i, regs->regs[i], i+18, regs->regs[i+18],
+		       i+36, regs->regs[i+36]);
+	pr_err(" r17: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+	       regs->regs[17], regs->regs[35], regs->tp);
 	pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
 #else
-	for (i = 0; i < 52; i += 4)
+	for (i = 0; i < 13; i++)
 		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT
 		       " r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
-		       i, regs->regs[i], i+1, regs->regs[i+1],
-		       i+2, regs->regs[i+2], i+3, regs->regs[i+3]);
-	pr_err(" r52: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n",
-	       regs->regs[52], regs->tp, regs->sp, regs->lr);
+		       i, regs->regs[i], i+14, regs->regs[i+14],
+		       i+27, regs->regs[i+27], i+40, regs->regs[i+40]);
+	pr_err(" r13: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n",
+	       regs->regs[13], regs->tp, regs->sp, regs->lr);
 #endif
 	pr_err(" pc : "REGFMT" ex1: %ld     faultnum: %ld\n",
 	       regs->pc, regs->ex1, regs->faultnum);
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index 0f83ed4..de98c6d 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -265,6 +265,21 @@ int do_syscall_trace_enter(struct pt_regs *regs)
 
 void do_syscall_trace_exit(struct pt_regs *regs)
 {
+	long errno;
+
+	/*
+	 * The standard tile calling convention returns the value (or negative
+	 * errno) in r0, and zero (or positive errno) in r1.
+	 * It saves a couple of cycles on the hot path to do this work in
+	 * registers only as we return, rather than updating the in-memory
+	 * struct pt_regs.
+	 */
+	errno = (long) regs->regs[0];
+	if (errno < 0 && errno > -4096)
+		regs->regs[1] = -errno;
+	else
+		regs->regs[1] = 0;
+
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, 0);
 
@@ -272,7 +287,7 @@ void do_syscall_trace_exit(struct pt_regs *regs)
 		trace_sys_exit(regs, regs->regs[0]);
 }
 
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs)
 {
 	struct siginfo info;
 
@@ -288,5 +303,5 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 /* Handle synthetic interrupt delivered only by the simulator. */
 void __kprobes do_breakpoint(struct pt_regs* regs, int fault_num)
 {
-	send_sigtrap(current, regs, fault_num);
+	send_sigtrap(current, regs);
 }
diff --git a/arch/tile/kernel/reboot.c b/arch/tile/kernel/reboot.c
index d1b5c91..6c5d2c0 100644
--- a/arch/tile/kernel/reboot.c
+++ b/arch/tile/kernel/reboot.c
@@ -27,7 +27,6 @@
 
 void machine_halt(void)
 {
-	warn_early_printk();
 	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_halt();
@@ -35,7 +34,6 @@ void machine_halt(void)
 
 void machine_power_off(void)
 {
-	warn_early_printk();
 	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_power_off();
diff --git a/arch/tile/kernel/regs_32.S b/arch/tile/kernel/regs_32.S
index c12280c..542cae1 100644
--- a/arch/tile/kernel/regs_32.S
+++ b/arch/tile/kernel/regs_32.S
@@ -20,7 +20,7 @@
 #include <asm/switch_to.h>
 
 /*
- * See <asm/system.h>; called with prev and next task_struct pointers.
+ * See <asm/switch_to.h>; called with prev and next task_struct pointers.
  * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
  *
  * We want to save pc/sp in "prev", and get the new pc/sp from "next".
@@ -39,7 +39,7 @@
  */
 
 #if CALLEE_SAVED_REGS_COUNT != 24
-# error Mismatch between <asm/system.h> and kernel/entry.S
+# error Mismatch between <asm/switch_to.h> and kernel/entry.S
 #endif
 #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 4)
 
diff --git a/arch/tile/kernel/regs_64.S b/arch/tile/kernel/regs_64.S
index 0829fd0..bbffcc6 100644
--- a/arch/tile/kernel/regs_64.S
+++ b/arch/tile/kernel/regs_64.S
@@ -20,7 +20,7 @@
 #include <asm/switch_to.h>
 
 /*
- * See <asm/system.h>; called with prev and next task_struct pointers.
+ * See <asm/switch_to.h>; called with prev and next task_struct pointers.
  * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
  *
  * We want to save pc/sp in "prev", and get the new pc/sp from "next".
@@ -39,7 +39,7 @@
  */
 
 #if CALLEE_SAVED_REGS_COUNT != 24
-# error Mismatch between <asm/system.h> and kernel/entry.S
+# error Mismatch between <asm/switch_to.h> and kernel/entry.S
 #endif
 #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 8)
 
diff --git a/arch/tile/kernel/relocate_kernel_32.S b/arch/tile/kernel/relocate_kernel_32.S
index 010b418..e44fbcf 100644
--- a/arch/tile/kernel/relocate_kernel_32.S
+++ b/arch/tile/kernel/relocate_kernel_32.S
@@ -20,15 +20,6 @@
 #include <asm/page.h>
 #include <hv/hypervisor.h>
 
-#define ___hvb	MEM_SV_INTRPT + HV_GLUE_START_CPA
-
-#define ___hv_dispatch(f) (___hvb + (HV_DISPATCH_ENTRY_SIZE * f))
-
-#define ___hv_console_putc ___hv_dispatch(HV_DISPATCH_CONSOLE_PUTC)
-#define ___hv_halt         ___hv_dispatch(HV_DISPATCH_HALT)
-#define ___hv_reexec       ___hv_dispatch(HV_DISPATCH_REEXEC)
-#define ___hv_flush_remote ___hv_dispatch(HV_DISPATCH_FLUSH_REMOTE)
-
 #undef RELOCATE_NEW_KERNEL_VERBOSE
 
 STD_ENTRY(relocate_new_kernel)
@@ -43,8 +34,8 @@ STD_ENTRY(relocate_new_kernel)
 	addi	sp, sp, -8
 	/* we now have a stack (whether we need one or not) */
 
-	moveli	r40, lo16(___hv_console_putc)
-	auli	r40, r40, ha16(___hv_console_putc)
+	moveli	r40, lo16(hv_console_putc)
+	auli	r40, r40, ha16(hv_console_putc)
 
 #ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, 'r'
@@ -86,7 +77,6 @@ STD_ENTRY(relocate_new_kernel)
 	move	r30, sp
 	addi	sp, sp, -8
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/*
 	 * On TILEPro, we need to flush all tiles' caches, since we may
 	 * have been doing hash-for-home caching there.  Note that we
@@ -114,15 +104,14 @@ STD_ENTRY(relocate_new_kernel)
 	}
 	{
 	 move	r8, zero	 /* asids */
-	 moveli	r20, lo16(___hv_flush_remote)
+	 moveli	r20, lo16(hv_flush_remote)
 	}
 	{
 	 move	r9, zero	 /* asidcount */
-	 auli	r20, r20, ha16(___hv_flush_remote)
+	 auli	r20, r20, ha16(hv_flush_remote)
 	}
 
 	jalr	r20
-#endif
 
 	/* r33 is destination pointer, default to zero */
 
@@ -175,8 +164,8 @@ STD_ENTRY(relocate_new_kernel)
 	move	r0, r32
 	moveli	r1, 0		/* arg to hv_reexec is 64 bits */
 
-	moveli	r41, lo16(___hv_reexec)
-	auli	r41, r41, ha16(___hv_reexec)
+	moveli	r41, lo16(hv_reexec)
+	auli	r41, r41, ha16(hv_reexec)
 
 	jalr	r41
 
@@ -267,8 +256,8 @@ STD_ENTRY(relocate_new_kernel)
 	moveli	r0, '\n'
 	jalr	r40
 .Lhalt:
-	moveli	r41, lo16(___hv_halt)
-	auli	r41, r41, ha16(___hv_halt)
+	moveli	r41, lo16(hv_halt)
+	auli	r41, r41, ha16(hv_halt)
 
 	jalr	r41
 	STD_ENDPROC(relocate_new_kernel)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..d9d8cf6 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
 	addi	sp, sp, -8
 	/* we now have a stack (whether we need one or not) */
 
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r40, hw2_last(hv_console_putc)
 	shl16insli r40, r40, hw1(hv_console_putc)
 	shl16insli r40, r40, hw0(hv_console_putc)
 
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, 'r'
 	jalr	r40
 
@@ -78,7 +78,6 @@ STD_ENTRY(relocate_new_kernel)
 	move	r30, sp
 	addi	sp, sp, -16
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/*
 	 * On TILE-GX, we need to flush all tiles' caches, since we may
 	 * have been doing hash-for-home caching there.  Note that we
@@ -116,7 +115,6 @@ STD_ENTRY(relocate_new_kernel)
 	shl16insli	r20, r20, hw0(hv_flush_remote)
 
 	jalr	r20
-#endif
 
 	/* r33 is destination pointer, default to zero */
 
@@ -176,10 +174,12 @@ STD_ENTRY(relocate_new_kernel)
 
 	/* we should not get here */
 
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, '?'
 	jalr	r40
 	moveli	r0, '\n'
 	jalr	r40
+#endif
 
 	j	.Lhalt
 
@@ -237,7 +237,9 @@ STD_ENTRY(relocate_new_kernel)
 	j	.Lloop
 
 
-.Lerr:	moveli	r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'e'
 	jalr	r40
 	moveli	r0, 'r'
 	jalr	r40
@@ -245,6 +247,7 @@ STD_ENTRY(relocate_new_kernel)
 	jalr	r40
 	moveli	r0, '\n'
 	jalr	r40
+#endif
 .Lhalt:
 	moveli r41, hw2_last(hv_halt)
 	shl16insli r41, r41, hw1(hv_halt)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index eceb834..4c34cae 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -154,6 +154,65 @@ static int __init setup_maxnodemem(char *str)
 }
 early_param("maxnodemem", setup_maxnodemem);
 
+struct memmap_entry {
+	u64 addr;	/* start address: the "ss" of "memmap=nn$ss" */
+	u64 size;	/* length in bytes: the "nn" of "memmap=nn$ss" */
+};
+static struct memmap_entry memmap_map[64];
+static int memmap_nr;	/* number of valid entries in memmap_map[] */
+
+static void add_memmap_region(u64 addr, u64 size)
+{
+	/* The table is fixed-size: report and drop a region that won't fit. */
+	if (memmap_nr >= ARRAY_SIZE(memmap_map)) {
+		pr_err("Ooops! Too many entries in the memory map!\n");
+		return;
+	}
+	memmap_map[memmap_nr++] =
+		(struct memmap_entry){ .addr = addr, .size = size };
+}
+
+static int __init setup_memmap(char *p)
+{
+	char *arg = p;
+	u64 base, len;
+
+	/* "memmap=nn" sets maxmem_pfn; "memmap=nn$ss" records a region. */
+	if (!arg)
+		return -EINVAL;
+	if (strncmp(arg, "exactmap", 8) == 0) {
+		pr_err("\"memmap=exactmap\" not valid on tile\n");
+		return 0;
+	}
+	len = memparse(arg, &p);
+	if (p == arg)
+		return -EINVAL;
+	switch (*p) {
+	case '@':
+		pr_err("\"memmap=nn@ss\" (force RAM) invalid on tile\n");
+		break;
+	case '#':
+		pr_err("\"memmap=nn#ss\" (force ACPI data) invalid on tile\n");
+		break;
+	case '$':
+		base = memparse(p + 1, &p);
+		add_memmap_region(base, len);
+		break;
+	default:
+		if (len == 0)
+			return -EINVAL;
+		maxmem_pfn = (len >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT);
+	}
+	return *p ? -EINVAL : 0;
+}
+early_param("memmap", setup_memmap);
+
+static int __init setup_mem(char *str)
+{
+	return setup_maxmem(str);	/* same parsing, same result */
+}
+early_param("mem", setup_mem);  /* compatibility with x86 */
+
 static int __init setup_isolnodes(char *str)
 {
 	char buf[MAX_NUMNODES * 5];
@@ -209,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
 /*
  * Determine for each controller where its lowmem is mapped and how much of
  * it is mapped there.  On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
  * start our data mappings higher up, but for now we don't bother, to avoid
  * additional confusion.
  *
@@ -614,11 +673,12 @@ static void __init setup_bootmem_allocator_node(int i)
 	/*
 	 * Throw away any memory aliased by the PCI region.
 	 */
-	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
-		reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
-				PFN_PHYS(pci_reserve_end_pfn -
-					 pci_reserve_start_pfn),
+	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start) {
+		start = max(pci_reserve_start_pfn, start);
+		end = min(pci_reserve_end_pfn, end);
+		reserve_bootmem(PFN_PHYS(start), PFN_PHYS(end - start),
 				BOOTMEM_EXCLUSIVE);
+	}
 #endif
 }
 
@@ -628,6 +688,31 @@ static void __init setup_bootmem_allocator(void)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		setup_bootmem_allocator_node(i);
 
+	/* Reserve any memory excluded by "memmap" arguments. */
+	for (i = 0; i < memmap_nr; ++i) {
+		struct memmap_entry *m = &memmap_map[i];
+		reserve_bootmem(m->addr, m->size, 0);
+	}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (initrd_start) {
+		/* Make sure the initrd memory region is not modified. */
+		if (reserve_bootmem(initrd_start, initrd_end - initrd_start,
+				    BOOTMEM_EXCLUSIVE)) {
+			pr_crit("The initrd memory region has been polluted. Disabling it.\n");
+			initrd_start = 0;
+			initrd_end = 0;
+		} else {
+			/*
+			 * Translate initrd_start & initrd_end from PA to VA for
+			 * future access.
+			 */
+			initrd_start += PAGE_OFFSET;
+			initrd_end += PAGE_OFFSET;
+		}
+	}
+#endif
+
 #ifdef CONFIG_KEXEC
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0);
@@ -961,9 +1046,6 @@ void setup_cpu(int boot)
 	arch_local_irq_unmask(INT_DMATLB_MISS);
 	arch_local_irq_unmask(INT_DMATLB_ACCESS);
 #endif
-#if CHIP_HAS_SN_PROC()
-	arch_local_irq_unmask(INT_SNITLB_MISS);
-#endif
 #ifdef __tilegx__
 	arch_local_irq_unmask(INT_SINGLE_STEP_K);
 #endif
@@ -978,10 +1060,6 @@ void setup_cpu(int boot)
 	/* Static network is not restricted. */
 	__insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1);
 #endif
-#if CHIP_HAS_SN_PROC()
-	__insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1);
-	__insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1);
-#endif
 
 	/*
 	 * Set the MPL for interrupt control 0 & 1 to the corresponding
@@ -1029,6 +1107,10 @@ static void __init load_hv_initrd(void)
 	int fd, rc;
 	void *initrd;
 
+	/* If initrd has already been set, skip initramfs file in hvfs. */
+	if (initrd_start)
+		return;
+
 	fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
 	if (fd == HV_ENOENT) {
 		if (set_initramfs_file) {
@@ -1067,6 +1149,25 @@ void __init free_initrd_mem(unsigned long begin, unsigned long end)
 	free_bootmem(__pa(begin), end - begin);
 }
 
+static int __init setup_initrd(char *str)
+{
+	unsigned long size;
+	char *sep;
+
+	/* Expect "initrd=<size>@<start>"; a zero size or start is rejected. */
+	size = str ? simple_strtoul(str, &sep, 0) : 0;
+	if (size == 0 || *sep != '@')
+		return -EINVAL;
+
+	initrd_start = simple_strtoul(sep + 1, &sep, 0);
+	if (initrd_start == 0)
+		return -EINVAL;
+
+	initrd_end = initrd_start + size;
+	return 0;
+}
+early_param("initrd", setup_initrd);
+
 #else
 static inline void load_hv_initrd(void) {}
 #endif /* CONFIG_BLK_DEV_INITRD */
@@ -1134,7 +1235,7 @@ static void __init validate_va(void)
 #ifndef __tilegx__   /* FIXME: GX: probably some validation relevant here */
 	/*
 	 * Similarly, make sure we're only using allowed VAs.
-	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
 	 * and 0 .. KERNEL_HIGH_VADDR.
 	 * In addition, make sure we CAN'T use the end of memory, since
 	 * we use the last chunk of each pgd for the pgd_list.
@@ -1149,7 +1250,7 @@ static void __init validate_va(void)
 		if (range.size == 0)
 			break;
 		if (range.start <= MEM_USER_INTRPT &&
-		    range.start + range.size >= MEM_HV_INTRPT)
+		    range.start + range.size >= MEM_HV_START)
 			user_kernel_ok = 1;
 		if (range.start == 0)
 			max_va = range.size;
@@ -1183,7 +1284,6 @@ static void __init validate_va(void)
 struct cpumask __write_once cpu_lotar_map;
 EXPORT_SYMBOL(cpu_lotar_map);
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /*
  * hash_for_home_map lists all the tiles that hash-for-home data
  * will be cached on.  Note that this may includes tiles that are not
@@ -1193,7 +1293,6 @@ EXPORT_SYMBOL(cpu_lotar_map);
  */
 struct cpumask hash_for_home_map;
 EXPORT_SYMBOL(hash_for_home_map);
-#endif
 
 /*
  * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
@@ -1286,7 +1385,6 @@ static void __init setup_cpu_maps(void)
 		cpu_lotar_map = *cpu_possible_mask;
 	}
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* Retrieve set of CPUs used for hash-for-home caching */
 	rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE,
 			      (HV_VirtAddr) hash_for_home_map.bits,
@@ -1294,9 +1392,6 @@ static void __init setup_cpu_maps(void)
 	if (rc < 0)
 		early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc);
 	cpumask_or(&cpu_cacheable_map, cpu_possible_mask, &hash_for_home_map);
-#else
-	cpu_cacheable_map = *cpu_possible_mask;
-#endif
 }
 
 
@@ -1492,7 +1587,7 @@ void __init setup_per_cpu_areas(void)
 
 			/* Update the vmalloc mapping and page home. */
 			unsigned long addr = (unsigned long)ptr + i;
-			pte_t *ptep = virt_to_pte(NULL, addr);
+			pte_t *ptep = virt_to_kpte(addr);
 			pte_t pte = *ptep;
 			BUG_ON(pfn != pte_pfn(pte));
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
@@ -1501,12 +1596,12 @@ void __init setup_per_cpu_areas(void)
 
 			/* Update the lowmem mapping for consistency. */
 			lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
-			ptep = virt_to_pte(NULL, lowmem_va);
+			ptep = virt_to_kpte(lowmem_va);
 			if (pte_huge(*ptep)) {
 				printk(KERN_DEBUG "early shatter of huge page"
 				       " at %#lx\n", lowmem_va);
 				shatter_pmd((pmd_t *)ptep);
-				ptep = virt_to_pte(NULL, lowmem_va);
+				ptep = virt_to_kpte(lowmem_va);
 				BUG_ON(pte_huge(*ptep));
 			}
 			BUG_ON(pfn != pte_pfn(*ptep));
@@ -1548,6 +1643,8 @@ insert_non_bus_resource(void)
 {
 	struct resource *res =
 		kzalloc(sizeof(struct resource), GFP_ATOMIC);
+	if (!res)
+		return NULL;
 	res->name = "Non-Bus Physical Address Space";
 	res->start = (1ULL << 32);
 	res->end = -1LL;
@@ -1561,11 +1658,13 @@ insert_non_bus_resource(void)
 #endif
 
 static struct resource* __init
-insert_ram_resource(u64 start_pfn, u64 end_pfn)
+insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
 {
 	struct resource *res =
 		kzalloc(sizeof(struct resource), GFP_ATOMIC);
-	res->name = "System RAM";
+	if (!res)
+		return NULL;
+	res->name = reserved ? "Reserved" : "System RAM";
 	res->start = start_pfn << PAGE_SHIFT;
 	res->end = (end_pfn << PAGE_SHIFT) - 1;
 	res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
@@ -1585,7 +1684,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn)
 static int __init request_standard_resources(void)
 {
 	int i;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
 #if defined(CONFIG_PCI) && !defined(__tilegx__)
 	insert_non_bus_resource();
@@ -1600,11 +1699,11 @@ static int __init request_standard_resources(void)
 		    end_pfn > pci_reserve_start_pfn) {
 			if (end_pfn > pci_reserve_end_pfn)
 				insert_ram_resource(pci_reserve_end_pfn,
-						     end_pfn);
+						    end_pfn, 0);
 			end_pfn = pci_reserve_start_pfn;
 		}
 #endif
-		insert_ram_resource(start_pfn, end_pfn);
+		insert_ram_resource(start_pfn, end_pfn, 0);
 	}
 
 	code_resource.start = __pa(_text - CODE_DELTA);
@@ -1615,6 +1714,13 @@ static int __init request_standard_resources(void)
 	insert_resource(&iomem_resource, &code_resource);
 	insert_resource(&iomem_resource, &data_resource);
 
+	/* Mark any "memmap" regions busy for the resource manager. */
+	for (i = 0; i < memmap_nr; ++i) {
+		struct memmap_entry *m = &memmap_map[i];
+		insert_ram_resource(PFN_DOWN(m->addr),
+				    PFN_UP(m->addr + m->size - 1), 1);
+	}
+
 #ifdef CONFIG_KEXEC
 	insert_resource(&iomem_resource, &crashk_res);
 #endif
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 9531845b..2d1dbf3 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -33,6 +33,7 @@
 #include <asm/ucontext.h>
 #include <asm/sigframe.h>
 #include <asm/syscalls.h>
+#include <asm/vdso.h>
 #include <arch/interrupts.h>
 
 #define DEBUG_SIG 0
@@ -190,7 +191,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (err)
 		goto give_sigsegv;
 
-	restorer = VDSO_BASE;
+	restorer = VDSO_SYM(&__vdso_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ka->sa.sa_restorer;
 
diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c
index 27742e8..de07fa7 100644
--- a/arch/tile/kernel/single_step.c
+++ b/arch/tile/kernel/single_step.c
@@ -12,41 +12,30 @@
  *   more details.
  *
  * A code-rewriter that enables instruction single-stepping.
- * Derived from iLib's single-stepping code.
  */
 
-#ifndef __tilegx__   /* Hardware support for single step unavailable. */
-
-/* These functions are only used on the TILE platform */
+#include <linux/smp.h>
+#include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
 #include <linux/uaccess.h>
 #include <linux/mman.h>
 #include <linux/types.h>
 #include <linux/err.h>
+#include <linux/prctl.h>
 #include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <arch/abi.h>
+#include <arch/spr_def.h>
 #include <arch/opcode.h>
 
-#define signExtend17(val) sign_extend((val), 17)
-#define TILE_X1_MASK (0xffffffffULL << 31)
-
-int unaligned_printk;
 
-static int __init setup_unaligned_printk(char *str)
-{
-	long val;
-	if (strict_strtol(str, 0, &val) != 0)
-		return 0;
-	unaligned_printk = val;
-	pr_info("Printk for each unaligned data accesses is %s\n",
-		unaligned_printk ? "enabled" : "disabled");
-	return 1;
-}
-__setup("unaligned_printk=", setup_unaligned_printk);
+#ifndef __tilegx__   /* Hardware support for single step unavailable. */
 
-unsigned int unaligned_fixup_count;
+#define signExtend17(val) sign_extend((val), 17)
+#define TILE_X1_MASK (0xffffffffULL << 31)
 
 enum mem_op {
 	MEMOP_NONE,
@@ -56,12 +45,13 @@ enum mem_op {
 	MEMOP_STORE_POSTINCR
 };
 
-static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
+static inline tilepro_bundle_bits set_BrOff_X1(tilepro_bundle_bits n,
+	s32 offset)
 {
-	tile_bundle_bits result;
+	tilepro_bundle_bits result;
 
 	/* mask out the old offset */
-	tile_bundle_bits mask = create_BrOff_X1(-1);
+	tilepro_bundle_bits mask = create_BrOff_X1(-1);
 	result = n & (~mask);
 
 	/* or in the new offset */
@@ -70,10 +60,11 @@ static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
 	return result;
 }
 
-static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src)
+static inline tilepro_bundle_bits move_X1(tilepro_bundle_bits n, int dest,
+	int src)
 {
-	tile_bundle_bits result;
-	tile_bundle_bits op;
+	tilepro_bundle_bits result;
+	tilepro_bundle_bits op;
 
 	result = n & (~TILE_X1_MASK);
 
@@ -87,13 +78,13 @@ static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src)
 	return result;
 }
 
-static inline tile_bundle_bits nop_X1(tile_bundle_bits n)
+static inline tilepro_bundle_bits nop_X1(tilepro_bundle_bits n)
 {
 	return move_X1(n, TREG_ZERO, TREG_ZERO);
 }
 
-static inline tile_bundle_bits addi_X1(
-	tile_bundle_bits n, int dest, int src, int imm)
+static inline tilepro_bundle_bits addi_X1(
+	tilepro_bundle_bits n, int dest, int src, int imm)
 {
 	n &= ~TILE_X1_MASK;
 
@@ -107,15 +98,26 @@ static inline tile_bundle_bits addi_X1(
 	return n;
 }
 
-static tile_bundle_bits rewrite_load_store_unaligned(
+static tilepro_bundle_bits rewrite_load_store_unaligned(
 	struct single_step_state *state,
-	tile_bundle_bits bundle,
+	tilepro_bundle_bits bundle,
 	struct pt_regs *regs,
 	enum mem_op mem_op,
 	int size, int sign_ext)
 {
 	unsigned char __user *addr;
 	int val_reg, addr_reg, err, val;
+	int align_ctl;
+
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
 
 	/* Get address and value registers */
 	if (bundle & TILEPRO_BUNDLE_Y_ENCODING_MASK) {
@@ -160,7 +162,7 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 	 * tilepro hardware would be doing, if it could provide us with the
 	 * actual bad address in an SPR, which it doesn't.
 	 */
-	if (unaligned_fixup == 0) {
+	if (align_ctl == 0) {
 		siginfo_t info = {
 			.si_signo = SIGBUS,
 			.si_code = BUS_ADRALN,
@@ -209,14 +211,14 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 
 	if (err) {
 		siginfo_t info = {
-			.si_signo = SIGSEGV,
-			.si_code = SEGV_MAPERR,
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
 			.si_addr = addr
 		};
-		trace_unhandled_signal("segfault", regs,
-				       (unsigned long)addr, SIGSEGV);
+		trace_unhandled_signal("bad address for unaligned fixup", regs,
+				       (unsigned long)addr, SIGBUS);
 		force_sig_info(info.si_signo, &info, current);
-		return (tile_bundle_bits) 0;
+		return (tilepro_bundle_bits) 0;
 	}
 
 	if (unaligned_printk || unaligned_fixup_count == 0) {
@@ -285,7 +287,7 @@ void single_step_execve(void)
 	ti->step_state = NULL;
 }
 
-/**
+/*
  * single_step_once() - entry point when single stepping has been triggered.
  * @regs: The machine register state
  *
@@ -304,20 +306,31 @@ void single_step_execve(void)
  */
 void single_step_once(struct pt_regs *regs)
 {
-	extern tile_bundle_bits __single_step_ill_insn;
-	extern tile_bundle_bits __single_step_j_insn;
-	extern tile_bundle_bits __single_step_addli_insn;
-	extern tile_bundle_bits __single_step_auli_insn;
+	extern tilepro_bundle_bits __single_step_ill_insn;
+	extern tilepro_bundle_bits __single_step_j_insn;
+	extern tilepro_bundle_bits __single_step_addli_insn;
+	extern tilepro_bundle_bits __single_step_auli_insn;
 	struct thread_info *info = (void *)current_thread_info();
 	struct single_step_state *state = info->step_state;
 	int is_single_step = test_ti_thread_flag(info, TIF_SINGLESTEP);
-	tile_bundle_bits __user *buffer, *pc;
-	tile_bundle_bits bundle;
+	tilepro_bundle_bits __user *buffer, *pc;
+	tilepro_bundle_bits bundle;
 	int temp_reg;
 	int target_reg = TREG_LR;
 	int err;
 	enum mem_op mem_op = MEMOP_NONE;
 	int size = 0, sign_ext = 0;  /* happy compiler */
+	int align_ctl;
+
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
 
 	asm(
 "    .pushsection .rodata.single_step\n"
@@ -390,7 +403,7 @@ void single_step_once(struct pt_regs *regs)
 	if (regs->faultnum == INT_SWINT_1)
 		regs->pc -= 8;
 
-	pc = (tile_bundle_bits __user *)(regs->pc);
+	pc = (tilepro_bundle_bits __user *)(regs->pc);
 	if (get_user(bundle, pc) != 0) {
 		pr_err("Couldn't read instruction at %p trying to step\n", pc);
 		return;
@@ -533,7 +546,6 @@ void single_step_once(struct pt_regs *regs)
 			}
 			break;
 
-#if CHIP_HAS_WH64()
 		/* postincrement operations */
 		case IMM_0_OPCODE_X1:
 			switch (get_ImmOpcodeExtension_X1(bundle)) {
@@ -568,7 +580,6 @@ void single_step_once(struct pt_regs *regs)
 				break;
 			}
 			break;
-#endif /* CHIP_HAS_WH64() */
 		}
 
 		if (state->update) {
@@ -627,9 +638,9 @@ void single_step_once(struct pt_regs *regs)
 
 	/*
 	 * Check if we need to rewrite an unaligned load/store.
-	 * Returning zero is a special value meaning we need to SIGSEGV.
+	 * Returning zero is a special value meaning we generated a signal.
 	 */
-	if (mem_op != MEMOP_NONE && unaligned_fixup >= 0) {
+	if (mem_op != MEMOP_NONE && align_ctl >= 0) {
 		bundle = rewrite_load_store_unaligned(state, bundle, regs,
 						      mem_op, size, sign_ext);
 		if (bundle == 0)
@@ -668,9 +679,9 @@ void single_step_once(struct pt_regs *regs)
 		}
 
 		/* End with a jump back to the next instruction */
-		delta = ((regs->pc + TILE_BUNDLE_SIZE_IN_BYTES) -
+		delta = ((regs->pc + TILEPRO_BUNDLE_SIZE_IN_BYTES) -
 			(unsigned long)buffer) >>
-			TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES;
+			TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES;
 		bundle = __single_step_j_insn;
 		bundle |= create_JOffLong_X1(delta);
 		err |= __put_user(bundle, buffer++);
@@ -698,9 +709,6 @@ void single_step_once(struct pt_regs *regs)
 }
 
 #else
-#include <linux/smp.h>
-#include <linux/ptrace.h>
-#include <arch/spr_def.h>
 
 static DEFINE_PER_CPU(unsigned long, ss_saved_pc);
 
@@ -743,10 +751,10 @@ void gx_singlestep_handle(struct pt_regs *regs, int fault_num)
 	} else if ((*ss_pc != regs->pc) ||
 		   (!(control & SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK))) {
 
-		ptrace_notify(SIGTRAP);
 		control |= SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK;
 		control |= SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK;
 		__insn_mtspr(SPR_SINGLE_STEP_CONTROL_K, control);
+		send_sigtrap(current, regs);
 	}
 }
 
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index cbc73a8..01e8ab2 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -20,8 +20,13 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <asm/cacheflush.h>
+#include <asm/homecache.h>
 
-HV_Topology smp_topology __write_once;
+/*
+ * We write to width and height with a single store in head_NN.S,
+ * so make the variable aligned to "long".
+ */
+HV_Topology smp_topology __write_once __aligned(sizeof(long));
 EXPORT_SYMBOL(smp_topology);
 
 #if CHIP_HAS_IPI()
@@ -100,8 +105,8 @@ static void smp_start_cpu_interrupt(void)
 /* Handler to stop the current cpu. */
 static void smp_stop_cpu_interrupt(void)
 {
-	set_cpu_online(smp_processor_id(), 0);
 	arch_local_irq_disable_all();
+	set_cpu_online(smp_processor_id(), 0);
 	for (;;)
 		asm("nap; nop");
 }
@@ -167,9 +172,16 @@ static void ipi_flush_icache_range(void *info)
 void flush_icache_range(unsigned long start, unsigned long end)
 {
 	struct ipi_flush flush = { start, end };
-	preempt_disable();
-	on_each_cpu(ipi_flush_icache_range, &flush, 1);
-	preempt_enable();
+
+	/* If invoked with irqs disabled, we can not issue IPIs. */
+	if (irqs_disabled())
+		flush_remote(0, HV_FLUSH_EVICT_L1I, NULL, 0, 0, 0,
+			NULL, NULL, 0);
+	else {
+		preempt_disable();
+		on_each_cpu(ipi_flush_icache_range, &flush, 1);
+		preempt_enable();
+	}
 }
 
 
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index a535655..732e9d1 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -142,13 +142,15 @@ static struct cpumask cpu_started;
  */
 static void start_secondary(void)
 {
-	int cpuid = smp_processor_id();
+	int cpuid;
+
+	preempt_disable();
+
+	cpuid = smp_processor_id();
 
 	/* Set our thread pointer appropriately. */
 	set_my_cpu_offset(__per_cpu_offset[cpuid]);
 
-	preempt_disable();
-
 	/*
 	 * In large machines even this will slow us down, since we
 	 * will be contending for for the printk spinlock.
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index af8dfc9..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -29,6 +29,7 @@
 #include <asm/switch_to.h>
 #include <asm/sigframe.h>
 #include <asm/stack.h>
+#include <asm/vdso.h>
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 
@@ -102,9 +103,8 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 	    p->sp >= sp) {
 		if (kbt->verbose)
 			pr_err("  <%s while in kernel mode>\n", fault);
-	} else if (EX1_PL(p->ex1) == USER_PL &&
-	    p->pc < PAGE_OFFSET &&
-	    p->sp < PAGE_OFFSET) {
+	} else if (user_mode(p) &&
+		   p->sp < PAGE_OFFSET && p->sp != 0) {
 		if (kbt->verbose)
 			pr_err("  <%s while in user mode>\n", fault);
 	} else if (kbt->verbose) {
@@ -120,7 +120,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 /* Is the pc pointing to a sigreturn trampoline? */
 static int is_sigreturn(unsigned long pc)
 {
-	return (pc == VDSO_BASE);
+	return current->mm && (pc == VDSO_SYM(&__vdso_rt_sigreturn));
 }
 
 /* Return a pt_regs pointer for a valid signal handler frame */
@@ -129,7 +129,7 @@ static struct pt_regs *valid_sigframe(struct KBacktraceIterator* kbt,
 {
 	BacktraceIterator *b = &kbt->it;
 
-	if (b->pc == VDSO_BASE && b->sp < PAGE_OFFSET &&
+	if (is_sigreturn(b->pc) && b->sp < PAGE_OFFSET &&
 	    b->sp % sizeof(long) == 0) {
 		int retval;
 		pagefault_disable();
@@ -195,21 +195,21 @@ static int KBacktraceIterator_next_item_inclusive(
  */
 static void validate_stack(struct pt_regs *regs)
 {
-	int cpu = smp_processor_id();
+	int cpu = raw_smp_processor_id();
 	unsigned long ksp0 = get_current_ksp0();
-	unsigned long ksp0_base = ksp0 - THREAD_SIZE;
+	unsigned long ksp0_base = ksp0 & -THREAD_SIZE;
 	unsigned long sp = stack_pointer;
 
 	if (EX1_PL(regs->ex1) == KERNEL_PL && regs->sp >= ksp0) {
-		pr_err("WARNING: cpu %d: kernel stack page %#lx underrun!\n"
+		pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx underrun!\n"
 		       "  sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n",
-		       cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr);
+		       cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr);
 	}
 
 	else if (sp < ksp0_base + sizeof(struct thread_info)) {
-		pr_err("WARNING: cpu %d: kernel stack page %#lx overrun!\n"
+		pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx overrun!\n"
 		       "  sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n",
-		       cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr);
+		       cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr);
 	}
 }
 
@@ -352,6 +352,26 @@ static void describe_addr(struct KBacktraceIterator *kbt,
 }
 
 /*
+ * Avoid possible crash recursion during backtrace.  If it happens, it
+ * makes it easy to lose the actual root cause of the failure, so we
+ * put a simple guard on all the backtrace loops.
+ */
+static bool start_backtrace(void)
+{
+	if (current->thread.in_backtrace) {
+		pr_err("Backtrace requested while in backtrace!\n");
+		return false;
+	}
+	current->thread.in_backtrace = true;
+	return true;
+}
+
+static void end_backtrace(void)
+{
+	current->thread.in_backtrace = false;
+}
+
+/*
  * This method wraps the backtracer's more generic support.
  * It is only invoked from the architecture-specific code; show_stack()
  * and dump_stack() (in entry.S) are architecture-independent entry points.
@@ -361,6 +381,8 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 	int i;
 	int have_mmap_sem = 0;
 
+	if (!start_backtrace())
+		return;
 	if (headers) {
 		/*
 		 * Add a blank line since if we are called from panic(),
@@ -371,7 +393,7 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		pr_err("Starting stack dump of tid %d, pid %d (%s)"
 		       " on cpu %d at cycle %lld\n",
 		       kbt->task->pid, kbt->task->tgid, kbt->task->comm,
-		       smp_processor_id(), get_cycles());
+		       raw_smp_processor_id(), get_cycles());
 	}
 	kbt->verbose = 1;
 	i = 0;
@@ -402,6 +424,7 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		pr_err("Stack dump complete\n");
 	if (have_mmap_sem)
 		up_read(&kbt->task->mm->mmap_sem);
+	end_backtrace();
 }
 EXPORT_SYMBOL(tile_show_stack);
 
@@ -463,6 +486,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 	int skip = trace->skip;
 	int i = 0;
 
+	if (!start_backtrace())
+		goto done;
 	if (task == NULL || task == current)
 		KBacktraceIterator_init_current(&kbt);
 	else
@@ -476,6 +501,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 			break;
 		trace->entries[i++] = kbt.it.pc;
 	}
+	end_backtrace();
+done:
 	trace->nr_entries = i;
 }
 EXPORT_SYMBOL(save_stack_trace_tsk);
diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c
index b881a7be..38debe7 100644
--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -38,8 +38,10 @@
 SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, len,
 		unsigned long, flags)
 {
+	/* DCACHE is not particularly effective if not bound to one cpu. */
 	if (flags & DCACHE)
-		homecache_evict(cpumask_of(smp_processor_id()));
+		homecache_evict(cpumask_of(raw_smp_processor_id()));
+
 	if (flags & ICACHE)
 		flush_remote(0, HV_FLUSH_EVICT_L1I, mm_cpumask(current->mm),
 			     0, 0, 0, NULL, NULL, 0);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a8..a3ed12f 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -157,6 +157,67 @@ hvconfig_bin_read(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
+static ssize_t hv_stats_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *page)
+{
+	int cpu = dev->id;
+	long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+
+	ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS,
+			       (unsigned long)page, PAGE_SIZE - 1,
+			       lotar, 0);
+	n = n < 0 ? 0 : min(n, (ssize_t)PAGE_SIZE - 1);
+	page[n] = '\0';
+	return n;
+}
+
+static ssize_t hv_stats_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *page,
+			      size_t count)
+{
+	int cpu = dev->id;
+	long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+
+	ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS, 0, 0, lotar, 1);
+	return n < 0 ? n : count;
+}
+
+static DEVICE_ATTR(hv_stats, 0644, hv_stats_show, hv_stats_store);
+
+static int hv_stats_device_add(struct device *dev, struct subsys_interface *sif)
+{
+	int err, cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	err = sysfs_create_file(&dev->kobj, &dev_attr_hv_stats.attr);
+
+	return err;
+}
+
+static int hv_stats_device_remove(struct device *dev,
+				  struct subsys_interface *sif)
+{
+	int cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
+	return 0;
+}
+
+
+static struct subsys_interface hv_stats_interface = {
+	.name			= "hv_stats",
+	.subsys			= &cpu_subsys,
+	.add_dev		= hv_stats_device_add,
+	.remove_dev		= hv_stats_device_remove,
+};
+
 static int __init create_sysfs_entries(void)
 {
 	int err = 0;
@@ -188,6 +249,21 @@ static int __init create_sysfs_entries(void)
 		err = sysfs_create_bin_file(hypervisor_kobj, &hvconfig_bin);
 	}
 
+	if (!err) {
+		/*
+		 * Don't bother adding the hv_stats files on each CPU if
+		 * our hypervisor doesn't supply statistics.
+		 */
+		int cpu = raw_smp_processor_id();
+		long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+		char dummy;
+		ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS,
+				       (unsigned long) &dummy, 1,
+				       lotar, 0);
+		if (n >= 0)
+			err = subsys_interface_register(&hv_stats_interface);
+	}
+
 	return err;
 }
 subsys_initcall(create_sysfs_entries);
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 7c353d8..5d10642 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -23,8 +23,10 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/timekeeper_internal.h>
 #include <asm/irq_regs.h>
 #include <asm/traps.h>
+#include <asm/vdso.h>
 #include <hv/hypervisor.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
@@ -110,7 +112,6 @@ void __init time_init(void)
 	setup_tile_timer();
 }
 
-
 /*
  * Define the tile timer clock event device.  The timer is driven by
  * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
@@ -237,3 +238,37 @@ cycles_t ns2cycles(unsigned long nsecs)
 	struct clock_event_device *dev = &__raw_get_cpu_var(tile_timer);
 	return ((u64)nsecs * dev->mult) >> dev->shift;
 }
+
+void update_vsyscall_tz(void)
+{
+	/* Userspace gettimeofday will spin while this value is odd. */
+	++vdso_data->tz_update_count;
+	smp_wmb();
+	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+	smp_wmb();
+	++vdso_data->tz_update_count;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct timespec wall_time = tk_xtime(tk);
+	struct timespec *wtm = &tk->wall_to_monotonic;
+	struct clocksource *clock = tk->clock;
+
+	if (clock != &cycle_counter_cs)
+		return;
+
+	/* Userspace gettimeofday will spin while this value is odd. */
+	++vdso_data->tb_update_count;
+	smp_wmb();
+	vdso_data->xtime_tod_stamp = clock->cycle_last;
+	vdso_data->xtime_clock_sec = wall_time.tv_sec;
+	vdso_data->xtime_clock_nsec = wall_time.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
+	vdso_data->mult = clock->mult;
+	vdso_data->shift = clock->shift;
+	smp_wmb();
+	++vdso_data->tb_update_count;
+}
diff --git a/arch/tile/kernel/tlb.c b/arch/tile/kernel/tlb.c
index 3fd54d5..f23b535 100644
--- a/arch/tile/kernel/tlb.c
+++ b/arch/tile/kernel/tlb.c
@@ -91,8 +91,14 @@ void flush_tlb_all(void)
 	}
 }
 
+/*
+ * Callers need to flush the L1I themselves if necessary, e.g. for
+ * kernel module unload.  Otherwise we assume callers are not using
+ * executable pgprot_t's.  Using EVICT_L1I means that dataplane cpus
+ * will get an unnecessary interrupt otherwise.
+ */
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	flush_remote(0, HV_FLUSH_EVICT_L1I, cpu_online_mask,
+	flush_remote(0, 0, NULL,
 		     start, end - start, PAGE_SIZE, cpu_online_mask, NULL, 0);
 }
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 5b19a23..6b603d5 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
+#include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/uaccess.h>
@@ -29,7 +30,7 @@
 
 void __init trap_init(void)
 {
-	/* Nothing needed here since we link code at .intrpt1 */
+	/* Nothing needed here since we link code at .intrpt */
 }
 
 int unaligned_fixup = 1;
@@ -100,13 +101,7 @@ static int retry_gpv(unsigned int gpv_reason)
 
 #endif /* CHIP_HAS_TILE_DMA() */
 
-#ifdef __tilegx__
-#define bundle_bits tilegx_bundle_bits
-#else
-#define bundle_bits tile_bundle_bits
-#endif
-
-extern bundle_bits bpt_code;
+extern tile_bundle_bits bpt_code;
 
 asm(".pushsection .rodata.bpt_code,\"a\";"
     ".align 8;"
@@ -114,7 +109,7 @@ asm(".pushsection .rodata.bpt_code,\"a\";"
     ".size bpt_code,.-bpt_code;"
     ".popsection");
 
-static int special_ill(bundle_bits bundle, int *sigp, int *codep)
+static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep)
 {
 	int sig, code, maxcode;
 
@@ -214,24 +209,73 @@ static const char *const int_name[] = {
 #endif
 };
 
+static int do_bpt(struct pt_regs *regs)
+{
+	unsigned long bundle, bcode, bpt;
+
+	bundle = *(unsigned long *)instruction_pointer(regs);
+
+	/*
+	 * bpt should be { bpt; nop }, which is 0x286a44ae51485000ULL.
+	 * We encode the unused least significant 12 bits for other purposes.
+	 */
+	bpt = bundle & ~((1ULL << 12) - 1);
+	if (bpt != TILE_BPT_BUNDLE)
+		return 0;
+
+	bcode = bundle & ((1ULL << 12) - 1);
+	/*
+	 * Notify the kprobe handlers, if the instruction is likely to
+	 * pertain to them.  Unrecognized codes make us return 0.
+	 */
+	switch (bcode) {
+	/* breakpoint_insn */
+	case 0:
+		notify_die(DIE_BREAK, "debug", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	/* compiled_bpt */
+	case DIE_COMPILED_BPT:
+		notify_die(DIE_COMPILED_BPT, "debug", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	/* breakpoint2_insn */
+	case DIE_SSTEPBP:
+		notify_die(DIE_SSTEPBP, "single_step", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
 void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		       unsigned long reason)
 {
 	siginfo_t info = { 0 };
 	int signo, code;
 	unsigned long address = 0;
-	bundle_bits instr;
+	tile_bundle_bits instr;
+	int is_kernel = !user_mode(regs);
+
+	/* Handle breakpoints, etc. */
+	if (is_kernel && fault_num == INT_ILL && do_bpt(regs))
+		return;
 
-	/* Re-enable interrupts. */
-	local_irq_enable();
+	/* Re-enable interrupts, if they were previously enabled. */
+	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
+		local_irq_enable();
 
 	/*
 	 * If it hits in kernel mode and we can't fix it up, just exit the
 	 * current process and hope for the best.
 	 */
-	if (!user_mode(regs)) {
+	if (is_kernel) {
 		const char *name;
-		if (fixup_exception(regs))  /* only UNALIGN_DATA in practice */
+		char buf[100];
+		if (fixup_exception(regs))  /* ILL_TRANS or UNALIGN_DATA */
 			return;
 		if (fault_num >= 0 &&
 		    fault_num < sizeof(int_name)/sizeof(int_name[0]) &&
@@ -239,10 +283,16 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 			name = int_name[fault_num];
 		else
 			name = "Unknown interrupt";
-		pr_alert("Kernel took bad trap %d (%s) at PC %#lx\n",
-			 fault_num, name, regs->pc);
 		if (fault_num == INT_GPV)
-			pr_alert("GPV_REASON is %#lx\n", reason);
+			snprintf(buf, sizeof(buf), "; GPV_REASON %#lx", reason);
+#ifdef __tilegx__
+		else if (fault_num == INT_ILL_TRANS)
+			snprintf(buf, sizeof(buf), "; address %#lx", reason);
+#endif
+		else
+			buf[0] = '\0';
+		pr_alert("Kernel took bad trap %d (%s) at PC %#lx%s\n",
+			 fault_num, name, regs->pc, buf);
 		show_regs(regs);
 		do_exit(SIGKILL);  /* FIXME: implement i386 die() */
 		return;
@@ -324,11 +374,8 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		fill_ra_stack();
 
 		signo = SIGSEGV;
+		address = reason;
 		code = SEGV_MAPERR;
-		if (reason & SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK)
-			address = regs->pc;
-		else
-			address = 0;  /* FIXME: GX: single-step for address */
 		break;
 	}
 #endif
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
new file mode 100644
index 0000000..b425fb6
--- /dev/null
+++ b/arch/tile/kernel/unaligned.c
@@ -0,0 +1,1609 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * A code-rewriter that handles unaligned exception.
+ */
+
+#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/uaccess.h>
+#include <linux/mman.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/prctl.h>
+#include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <arch/abi.h>
+#include <arch/spr_def.h>
+#include <arch/opcode.h>
+
+
+/*
+ * This file handles unaligned exceptions for tile-Gx. The tilepro unaligned
+ * exception is handled in single_step.c.
+ */
+
+int unaligned_printk;
+
+static int __init setup_unaligned_printk(char *str)
+{
+	long val;
+	if (kstrtol(str, 0, &val) != 0)
+		return 0;
+	unaligned_printk = val;
+	pr_info("Printk for each unaligned data accesses is %s\n",
+		unaligned_printk ? "enabled" : "disabled");
+	return 1;
+}
+__setup("unaligned_printk=", setup_unaligned_printk);
+
+unsigned int unaligned_fixup_count;
+
+#ifdef __tilegx__
+
+/*
+ * Unaligned data JIT fixup code fragment. Reserved space is 128 bytes.
+ * The 1st 64-bit word saves the fault PC address, the 2nd word is the
+ * faulting instruction bundle, followed by 14 JIT bundles.
+ */
+
+struct unaligned_jit_fragment {
+	unsigned long       pc;		/* fault PC address */
+	tilegx_bundle_bits  bundle;	/* fault instruction bundle */
+	tilegx_bundle_bits  insn[14];	/* JIT bundles */
+};
+
+/*
+ * Check if a nop or fnop at bundle's pipeline X0.
+ */
+
+static bool is_bundle_x0_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_X0(bundle) ==
+		  NOP_UNARY_OPCODE_X0) &&
+		 (get_RRROpcodeExtension_X0(bundle) ==
+		  UNARY_RRR_0_OPCODE_X0) &&
+		 (get_Opcode_X0(bundle) ==
+		  RRR_0_OPCODE_X0)) ||
+		((get_UnaryOpcodeExtension_X0(bundle) ==
+		  FNOP_UNARY_OPCODE_X0) &&
+		 (get_RRROpcodeExtension_X0(bundle) ==
+		  UNARY_RRR_0_OPCODE_X0) &&
+		 (get_Opcode_X0(bundle) ==
+		  RRR_0_OPCODE_X0)));
+}
+
+/*
+ * Check if nop or fnop at bundle's pipeline X1.
+ */
+
+static bool is_bundle_x1_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_X1(bundle) ==
+		  NOP_UNARY_OPCODE_X1) &&
+		 (get_RRROpcodeExtension_X1(bundle) ==
+		  UNARY_RRR_0_OPCODE_X1) &&
+		 (get_Opcode_X1(bundle) ==
+		  RRR_0_OPCODE_X1)) ||
+		((get_UnaryOpcodeExtension_X1(bundle) ==
+		  FNOP_UNARY_OPCODE_X1) &&
+		 (get_RRROpcodeExtension_X1(bundle) ==
+		  UNARY_RRR_0_OPCODE_X1) &&
+		 (get_Opcode_X1(bundle) ==
+		  RRR_0_OPCODE_X1)));
+}
+
+/*
+ * Check if nop or fnop at bundle's Y0 pipeline.
+ */
+
+static bool is_bundle_y0_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_Y0(bundle) ==
+		  NOP_UNARY_OPCODE_Y0) &&
+		 (get_RRROpcodeExtension_Y0(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y0) &&
+		 (get_Opcode_Y0(bundle) ==
+		  RRR_1_OPCODE_Y0)) ||
+		((get_UnaryOpcodeExtension_Y0(bundle) ==
+		  FNOP_UNARY_OPCODE_Y0) &&
+		 (get_RRROpcodeExtension_Y0(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y0) &&
+		 (get_Opcode_Y0(bundle) ==
+		  RRR_1_OPCODE_Y0)));
+}
+
+/*
+ * Check if nop or fnop at bundle's pipeline Y1.
+ */
+
+static bool is_bundle_y1_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_Y1(bundle) ==
+		  NOP_UNARY_OPCODE_Y1) &&
+		 (get_RRROpcodeExtension_Y1(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y1) &&
+		 (get_Opcode_Y1(bundle) ==
+		  RRR_1_OPCODE_Y1)) ||
+		((get_UnaryOpcodeExtension_Y1(bundle) ==
+		  FNOP_UNARY_OPCODE_Y1) &&
+		 (get_RRROpcodeExtension_Y1(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y1) &&
+		 (get_Opcode_Y1(bundle) ==
+		  RRR_1_OPCODE_Y1)));
+}
+
+/*
+ * Test if a bundle's y0 and y1 pipelines are both nop or fnop.
+ */
+
+static bool is_y0_y1_nop(tilegx_bundle_bits bundle)
+{
+	return is_bundle_y0_nop(bundle) && is_bundle_y1_nop(bundle);
+}
+
+/*
+ * Test if a bundle's x0 and x1 pipelines are both nop or fnop.
+ */
+
+static bool is_x0_x1_nop(tilegx_bundle_bits bundle)
+{
+	return is_bundle_x0_nop(bundle) && is_bundle_x1_nop(bundle);
+}
+
+/*
+ * Find the destination and source registers of the faulting unaligned access
+ * instruction at X1 or Y2. Also, allocate up to 3 scratch registers clob1,
+ * clob2 and clob3, which are guaranteed different from any register used in
+ * the fault bundle. r_alias returns whether any instruction other than the
+ * unaligned load/store shares a register with ra, rb or rd.
+ */
+
+static void find_regs(tilegx_bundle_bits bundle, uint64_t *rd, uint64_t *ra,
+		      uint64_t *rb, uint64_t *clob1, uint64_t *clob2,
+		      uint64_t *clob3, bool *r_alias)
+{
+	int i;
+	uint64_t reg;
+	uint64_t reg_map = 0, alias_reg_map = 0, map;
+	bool alias;
+
+	*ra = -1;
+	*rb = -1;
+
+	if (rd)
+		*rd = -1;
+
+	*clob1 = -1;
+	*clob2 = -1;
+	*clob3 = -1;
+	alias = false;
+
+	/*
+	 * Parse the fault bundle, find potentially used registers and mark
+	 * corresponding bits in reg_map and alias_reg_map. These 2 bit maps
+	 * are used to find the scratch registers and determine if there
+	 * is a register alias.
+	 */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {  /* Y Mode Bundle. */
+
+		reg = get_SrcA_Y2(bundle);
+		reg_map |= 1ULL << reg;
+		*ra = reg;
+		reg = get_SrcBDest_Y2(bundle);
+		reg_map |= 1ULL << reg;
+
+		if (rd) {
+			/* Load. */
+			*rd = reg;
+			alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+		} else {
+			/* Store. */
+			*rb = reg;
+			alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+		}
+
+		if (!is_bundle_y1_nop(bundle)) {
+			reg = get_SrcA_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+
+		if (!is_bundle_y0_nop(bundle)) {
+			reg = get_SrcA_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+	} else	{ /* X Mode Bundle. */
+
+		reg = get_SrcA_X1(bundle);
+		reg_map |= (1ULL << reg);
+		*ra = reg;
+		if (rd)	{
+			/* Load. */
+			reg = get_Dest_X1(bundle);
+			reg_map |= (1ULL << reg);
+			*rd = reg;
+			alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+		} else {
+			/* Store. */
+			reg = get_SrcB_X1(bundle);
+			reg_map |= (1ULL << reg);
+			*rb = reg;
+			alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+		}
+
+		if (!is_bundle_x0_nop(bundle)) {
+			reg = get_SrcA_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+	}
+
+	/*
+	 * "alias" indicates if the unaligned access registers collide
+	 * with others in the same bundle. We simply test the all-register
+	 * operand case (RRR) and ignore the case with an immediate. If a
+	 * bundle has no register alias, we may do the fixup in a simple or
+	 * fast manner. So if an immediate field happens to match a register,
+	 * we may end up falling back to the generic handling.
+	 */
+
+	*r_alias = alias;
+
+	/* Flip bits on reg_map. */
+	reg_map ^= -1ULL;
+
+	/* Scan reg_map lower 54(TREG_SP) bits to find 3 set bits. */
+	for (i = 0; i < TREG_SP; i++) {
+		if (reg_map & (0x1ULL << i)) {
+			if (*clob1 == -1) {
+				*clob1 = i;
+			} else if (*clob2 == -1) {
+				*clob2 = i;
+			} else if (*clob3 == -1) {
+				*clob3 = i;
+				return;
+			}
+		}
+	}
+}
+
+/*
+ * Sanity check registers ra, rb or rd, and clob1/2/3: each must be below 56
+ * (ra, rb and rd may also be TREG_ZERO). Return true if any is unexpected.
+ */
+
+static bool check_regs(uint64_t rd, uint64_t ra, uint64_t rb,
+		       uint64_t clob1, uint64_t clob2,  uint64_t clob3)
+{
+	bool unexpected = false;
+	if ((ra >= 56) && (ra != TREG_ZERO))
+		unexpected = true;
+
+	if ((clob1 >= 56) || (clob2 >= 56) || (clob3 >= 56))
+		unexpected = true;
+
+	if (rd != -1) {		/* rd given: validate it rather than rb */
+		if ((rd >= 56) && (rd != TREG_ZERO))
+			unexpected = true;
+	} else {
+		if ((rb >= 56) && (rb != TREG_ZERO))
+			unexpected = true;
+	}
+	return unexpected;
+}
+
+
+#define  GX_INSN_X0_MASK   ((1ULL << 31) - 1)
+#define  GX_INSN_X1_MASK   (((1ULL << 31) - 1) << 31)
+#define  GX_INSN_Y0_MASK   ((0xFULL << 27) | (0xFFFFFULL))
+#define  GX_INSN_Y1_MASK   (GX_INSN_Y0_MASK << 31)
+#define  GX_INSN_Y2_MASK   ((0x7FULL << 51) | (0x7FULL << 20))
+
+#ifdef __LITTLE_ENDIAN
+#define  GX_INSN_BSWAP(_bundle_)    (_bundle_)
+#else
+#define  GX_INSN_BSWAP(_bundle_)    swab64(_bundle_)
+#endif /* __LITTLE_ENDIAN */
+
+/*
+ * __JIT_CODE(.) creates template bundles in the .rodata.unalign_data section.
+ * The corresponding static function jit_x#_###(.) generates a partial or
+ * whole bundle based on the template and the given arguments.
+ */
+
+#define __JIT_CODE(_X_)						\
+	asm (".pushsection .rodata.unalign_data, \"a\"\n"	\
+	     _X_"\n"						\
+	     ".popsection\n")
+
+__JIT_CODE("__unalign_jit_x1_mtspr:   {mtspr 0,  r0}");
+static tilegx_bundle_bits jit_x1_mtspr(int spr, int reg)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_mtspr;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_mtspr) & GX_INSN_X1_MASK) |
+		create_MT_Imm14_X1(spr) | create_SrcA_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x1_mfspr:   {mfspr r0, 0}");
+static tilegx_bundle_bits  jit_x1_mfspr(int reg, int spr)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_mfspr;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_mfspr) & GX_INSN_X1_MASK) |
+		create_MF_Imm14_X1(spr) | create_Dest_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x0_addi:   {addi  r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_addi(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_addi;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_addi) & GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_Imm8_X0(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ldna:   {ldna  r0, r0}");
+static tilegx_bundle_bits  jit_x1_ldna(int rd, int ra)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ldna;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ldna) &  GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x0_dblalign:   {dblalign r0, r0 ,r0}");
+static tilegx_bundle_bits  jit_x0_dblalign(int rd, int ra, int rb)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_dblalign;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_dblalign) & GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_SrcB_X0(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_iret:   {iret}");
+static tilegx_bundle_bits  jit_x1_iret(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_iret;
+	return GX_INSN_BSWAP(__unalign_jit_x1_iret) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x01_fnop:   {fnop;fnop}");
+static tilegx_bundle_bits  jit_x0_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x01_fnop;
+	return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X0_MASK;
+}
+
+static tilegx_bundle_bits  jit_x1_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x01_fnop;
+	return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_y2_dummy:   {fnop; fnop; ld zero, sp}");
+static tilegx_bundle_bits  jit_y2_dummy(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_y2_dummy;
+	return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y2_MASK;
+}
+
+static tilegx_bundle_bits  jit_y1_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_y2_dummy;
+	return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x1_st1_add:  {st1_add r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_st1_add(int ra, int rb, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st1_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st1_add) &
+		(~create_SrcA_X1(-1)) &
+		GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+		create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_st:  {crc32_8 r1, r0, r0; st  r0, r0}");
+static tilegx_bundle_bits  jit_x1_st(int ra, int rb)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st) & GX_INSN_X1_MASK) |
+		create_SrcA_X1(ra) | create_SrcB_X1(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_st_add:  {st_add  r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_st_add(int ra, int rb, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st_add) &
+		(~create_SrcA_X1(-1)) &
+		GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+		create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld:  {crc32_8 r1, r0, r0; ld  r0, r0}");
+static tilegx_bundle_bits  jit_x1_ld(int rd, int ra)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ld;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ld) & GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld_add:  {ld_add  r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_ld_add(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ld_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ld_add) &
+		(~create_Dest_X1(-1)) &
+		GX_INSN_X1_MASK) | create_Dest_X1(rd) |
+		create_SrcA_X1(ra) | create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfexts:  {bfexts r0, r0, 0, 0}");
+static tilegx_bundle_bits  jit_x0_bfexts(int rd, int ra, int bfs, int bfe)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_bfexts;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_bfexts) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfextu:  {bfextu r0, r0, 0, 0}");
+static tilegx_bundle_bits  jit_x0_bfextu(int rd, int ra, int bfs, int bfe)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_bfextu;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_bfextu) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x1_addi:  {bfextu r1, r1, 0, 0; addi r0, r0, 0}");
+static tilegx_bundle_bits  jit_x1_addi(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_addi;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_addi) & GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra) |
+		create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_shrui:  {shrui r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_shrui(int rd, int ra, int imm6)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_shrui;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_shrui) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x0_rotli:  {rotli r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_rotli(int rd, int ra, int imm6)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_rotli;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_rotli) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x1_bnezt:  {bnezt r0, __unalign_jit_x1_bnezt}");
+static tilegx_bundle_bits  jit_x1_bnezt(int ra, int broff)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_bnezt;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_bnezt) &
+		GX_INSN_X1_MASK) |
+		create_SrcA_X1(ra) | create_BrOff_X1(broff);
+}
+
+#undef __JIT_CODE
+
+/*
+ * This function generates the unaligned-access fixup JIT.
+ *
+ * We first find the unaligned load/store instruction's destination and
+ * source registers (ra, rb and rd) and 3 scratch registers by calling
+ * find_regs(...). The 3 scratch clobbers must not alias any register
+ * used in the fault bundle. Then we analyze the fault bundle to determine
+ * if it's a load or store, operand width, branch or address increment etc.
+ * Finally the generated JIT is copied into the JIT code area in user space.
+ */
+
+static
+void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle,
+		    int align_ctl)
+{
+	struct thread_info *info = current_thread_info();
+	struct unaligned_jit_fragment frag;
+	struct unaligned_jit_fragment *jit_code_area;
+	tilegx_bundle_bits bundle_2 = 0;
+	/* If bundle_2_enable = false, bundle_2 is fnop/nop operation. */
+	bool     bundle_2_enable = true;
+	/*
+	 * -1 means "not found"; find_regs() is skipped for undecodable
+	 * bundles, yet check_regs() and the SIGBUS path still read these.
+	 */
+	uint64_t ra = -1, rb = -1, rd = -1, clob1 = -1, clob2 = -1, clob3 = -1;
+	/* True if the access's registers collide with others in the bundle. */
+	bool     alias = false;
+	bool     load_n_store = true;
+	bool     load_store_signed = false;
+	unsigned int  load_store_size = 8;
+	bool     y1_br = false;  /* True, for a branch in same bundle at Y1.*/
+	int      y1_br_reg = 0;
+	/* True for link operation. i.e. jalr or lnk at Y1 */
+	bool     y1_lr = false;
+	int      y1_lr_reg = 0;
+	bool     x1_add = false;/* True, for load/store ADD instruction at X1*/
+	int      x1_add_imm8 = 0;
+	bool     unexpected = false;
+	int      n = 0, k;
+
+	jit_code_area =
+		(struct unaligned_jit_fragment *)(info->unalign_jit_base);
+
+	memset((void *)&frag, 0, sizeof(frag));
+
+	/* 0: X mode, Otherwise: Y mode. */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+		unsigned int mod, opcode;
+
+		if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+		    get_RRROpcodeExtension_Y1(bundle) ==
+		    UNARY_RRR_1_OPCODE_Y1) {
+
+			opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+			/*
+			 * Test "jalr", "jalrp", "jr", "jrp" instruction at Y1
+			 * pipeline.
+			 */
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_Y1:
+			case JALRP_UNARY_OPCODE_Y1:
+				y1_lr = true;
+				y1_lr_reg = 55; /* Link register. */
+				/* FALLTHROUGH */
+			case JR_UNARY_OPCODE_Y1:
+			case JRP_UNARY_OPCODE_Y1:
+				y1_br = true;
+				y1_br_reg = get_SrcA_Y1(bundle);
+				break;
+			case LNK_UNARY_OPCODE_Y1:
+				/* "lnk" at Y1 pipeline. */
+				y1_lr = true;
+				y1_lr_reg = get_Dest_Y1(bundle);
+				break;
+			}
+		}
+
+		opcode = get_Opcode_Y2(bundle);
+		mod = get_Mode(bundle);
+
+		/*
+		 *  bundle_2 is bundle after making Y2 as a dummy operation
+		 *  - ld zero, sp
+		 */
+		bundle_2 = (bundle & (~GX_INSN_Y2_MASK)) | jit_y2_dummy();
+
+		/* Make Y1 as fnop if Y1 is a branch or lnk operation. */
+		if (y1_br || y1_lr) {
+			bundle_2 &= ~(GX_INSN_Y1_MASK);
+			bundle_2 |= jit_y1_fnop();
+		}
+
+		if (is_y0_y1_nop(bundle_2))
+			bundle_2_enable = false;
+
+		if (mod == MODE_OPCODE_YC2) {
+			/* Store. */
+			load_n_store = false;
+			load_store_size = 1 << opcode;
+			load_store_signed = false;
+			find_regs(bundle, 0, &ra, &rb, &clob1, &clob2,
+				  &clob3, &alias);
+			if (load_store_size > 8)
+				unexpected = true;
+		} else {
+			/* Load. */
+			load_n_store = true;
+			if (mod == MODE_OPCODE_YB2) {
+				switch (opcode) {
+				case LD_OPCODE_Y2:
+					load_store_signed = false;
+					load_store_size = 8;
+					break;
+				case LD4S_OPCODE_Y2:
+					load_store_signed = true;
+					load_store_size = 4;
+					break;
+				case LD4U_OPCODE_Y2:
+					load_store_signed = false;
+					load_store_size = 4;
+					break;
+				default:
+					unexpected = true;
+				}
+			} else if (mod == MODE_OPCODE_YA2) {
+				if (opcode == LD2S_OPCODE_Y2) {
+					load_store_signed = true;
+					load_store_size = 2;
+				} else if (opcode == LD2U_OPCODE_Y2) {
+					load_store_signed = false;
+					load_store_size = 2;
+				} else
+					unexpected = true;
+			} else
+				unexpected = true;
+			find_regs(bundle, &rd, &ra, &rb, &clob1, &clob2,
+				  &clob3, &alias);
+		}
+	} else {
+		unsigned int opcode;
+
+		/* bundle_2 is bundle after making X1 as "fnop". */
+		bundle_2 = (bundle & (~GX_INSN_X1_MASK)) | jit_x1_fnop();
+
+		if (is_x0_x1_nop(bundle_2))
+			bundle_2_enable = false;
+
+		if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+			opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+			if (get_RRROpcodeExtension_X1(bundle) ==
+			    UNARY_RRR_0_OPCODE_X1) {
+				load_n_store = true;
+				find_regs(bundle, &rd, &ra, &rb, &clob1,
+					  &clob2, &clob3, &alias);
+
+				switch (opcode) {
+				case LD_UNARY_OPCODE_X1:
+					load_store_signed = false;
+					load_store_size = 8;
+					break;
+				case LD4S_UNARY_OPCODE_X1:
+					load_store_signed = true;
+					/* FALLTHROUGH */
+				case LD4U_UNARY_OPCODE_X1:
+					load_store_size = 4;
+					break;
+
+				case LD2S_UNARY_OPCODE_X1:
+					load_store_signed = true;
+					/* FALLTHROUGH */
+				case LD2U_UNARY_OPCODE_X1:
+					load_store_size = 2;
+					break;
+				default:
+					unexpected = true;
+				}
+			} else {
+				load_n_store = false;
+				load_store_signed = false;
+				find_regs(bundle, 0, &ra, &rb,
+					  &clob1, &clob2, &clob3,
+					  &alias);
+
+				opcode = get_RRROpcodeExtension_X1(bundle);
+				switch (opcode)	{
+				case ST_RRR_0_OPCODE_X1:
+					load_store_size = 8;
+					break;
+				case ST4_RRR_0_OPCODE_X1:
+					load_store_size = 4;
+					break;
+				case ST2_RRR_0_OPCODE_X1:
+					load_store_size = 2;
+					break;
+				default:
+					unexpected = true;
+				}
+			}
+		} else if (get_Opcode_X1(bundle) == IMM8_OPCODE_X1) {
+			load_n_store = true;
+			opcode = get_Imm8OpcodeExtension_X1(bundle);
+			switch (opcode)	{
+			case LD_ADD_IMM8_OPCODE_X1:
+				load_store_size = 8;
+				break;
+
+			case LD4S_ADD_IMM8_OPCODE_X1:
+				load_store_signed = true;
+				/* FALLTHROUGH */
+			case LD4U_ADD_IMM8_OPCODE_X1:
+				load_store_size = 4;
+				break;
+
+			case LD2S_ADD_IMM8_OPCODE_X1:
+				load_store_signed = true;
+				/* FALLTHROUGH */
+			case LD2U_ADD_IMM8_OPCODE_X1:
+				load_store_size = 2;
+				break;
+
+			case ST_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 8;
+				break;
+			case ST4_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 4;
+				break;
+			case ST2_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 2;
+				break;
+			default:
+				unexpected = true;
+			}
+
+			if (!unexpected) {
+				x1_add = true;
+				if (load_n_store)
+					x1_add_imm8 = get_Imm8_X1(bundle);
+				else
+					x1_add_imm8 = get_Dest_Imm8_X1(bundle);
+			}
+
+			find_regs(bundle, load_n_store ? (&rd) : NULL,
+				  &ra, &rb, &clob1, &clob2, &clob3, &alias);
+		} else
+			unexpected = true;
+	}
+
+	/*
+	 * Some sanity check for register numbers extracted from fault bundle.
+	 */
+	if (check_regs(rd, ra, rb, clob1, clob2, clob3) == true)
+		unexpected = true;
+
+	/* Give warning if register ra has an aligned address. */
+	if (!unexpected)
+		WARN_ON(!((load_store_size - 1) & (regs->regs[ra])));
+
+
+	/*
+	 * Fault came from kernel space, here we only need take care of
+	 * unaligned "get_user/put_user" macros defined in "uaccess.h".
+	 * Basically, we will handle bundle like this:
+	 * {ld/2u/4s rd, ra; movei rx, 0} or {st/2/4 ra, rb; movei rx, 0}
+	 * (Refer to file "arch/tile/include/asm/uaccess.h" for details).
+	 * For either load or store, byte-wise operation is performed by calling
+	 * get_user() or put_user(). If the macro returns non-zero value,
+	 * set the value to rx, otherwise set zero to rx. Finally make pc point
+	 * to next bundle and return.
+	 */
+
+	if (EX1_PL(regs->ex1) != USER_PL) {
+
+		unsigned long rx = 0;
+		unsigned long x = 0, ret = 0;
+
+		if (y1_br || y1_lr || x1_add ||
+		    (load_store_signed !=
+		     (load_n_store && load_store_size == 4))) {
+			/* No branch, link, wrong sign-ext or load/store add. */
+			unexpected = true;
+		} else if (!unexpected) {
+			if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+				/*
+				 * Fault bundle is Y mode.
+				 * Check if the Y1 and Y0 is the form of
+				 * { movei rx, 0; nop/fnop }, if yes,
+				 * find the rx.
+				 */
+
+				if ((get_Opcode_Y1(bundle) == ADDI_OPCODE_Y1)
+				    && (get_SrcA_Y1(bundle) == TREG_ZERO) &&
+				    (get_Imm8_Y1(bundle) == 0) &&
+				    is_bundle_y0_nop(bundle)) {
+					rx = get_Dest_Y1(bundle);
+				} else if ((get_Opcode_Y0(bundle) ==
+					    ADDI_OPCODE_Y0) &&
+					   (get_SrcA_Y0(bundle) == TREG_ZERO) &&
+					   (get_Imm8_Y0(bundle) == 0) &&
+					   is_bundle_y1_nop(bundle)) {
+					rx = get_Dest_Y0(bundle);
+				} else {
+					unexpected = true;
+				}
+			} else {
+				/*
+				 * Fault bundle is X mode.
+				 * Check if the X0 is 'movei rx, 0',
+				 * if yes, find the rx.
+				 */
+
+				if ((get_Opcode_X0(bundle) == IMM8_OPCODE_X0)
+				    && (get_Imm8OpcodeExtension_X0(bundle) ==
+					ADDI_IMM8_OPCODE_X0) &&
+				    (get_SrcA_X0(bundle) == TREG_ZERO) &&
+				    (get_Imm8_X0(bundle) == 0)) {
+					rx = get_Dest_X0(bundle);
+				} else {
+					unexpected = true;
+				}
+			}
+
+			/* rx should be less than 56. */
+			if (!unexpected && (rx >= 56))
+				unexpected = true;
+		}
+
+		if (!search_exception_tables(regs->pc)) {
+			/* No fixup in the exception tables for the pc. */
+			unexpected = true;
+		}
+
+		if (unexpected) {
+			/* Unexpected unalign kernel fault. */
+			struct task_struct *tsk = validate_current();
+
+			bust_spinlocks(1);
+
+			show_regs(regs);
+
+			if (unlikely(tsk->pid < 2)) {
+				panic("Kernel unalign fault running %s!",
+				      tsk->pid ? "init" : "the idle task");
+			}
+#ifdef SUPPORT_DIE
+			die("Oops", regs);
+#endif
+			/* Pair with bust_spinlocks(1) above before exiting. */
+			bust_spinlocks(0);
+			do_group_exit(SIGKILL);
+
+		} else {
+			unsigned long i, b = 0;
+			unsigned char *ptr =
+				(unsigned char *)regs->regs[ra];
+			if (load_n_store) {
+				/* handle get_user(x, ptr) */
+				for (i = 0; i < load_store_size; i++) {
+					ret = get_user(b, ptr++);
+					if (!ret) {
+						/* Success! update x. */
+#ifdef __LITTLE_ENDIAN
+						x |= (b << (8 * i));
+#else
+						x <<= 8;
+						x |= b;
+#endif /* __LITTLE_ENDIAN */
+					} else {
+						x = 0;
+						break;
+					}
+				}
+
+				/* Sign-extend 4-byte loads. */
+				if (load_store_size == 4)
+					x = (long)(int)x;
+
+				/* Set register rd. */
+				regs->regs[rd] = x;
+
+				/* Set register rx. */
+				regs->regs[rx] = ret;
+
+				/* Bump pc. */
+				regs->pc += 8;
+
+			} else {
+				/* Handle put_user(x, ptr) */
+				x = regs->regs[rb];
+#ifdef __LITTLE_ENDIAN
+				b = x;
+#else
+				/*
+				 * Swap x in order to store x from low
+				 * to high memory same as the
+				 * little-endian case.
+				 */
+				switch (load_store_size) {
+				case 8:
+					b = swab64(x);
+					break;
+				case 4:
+					b = swab32(x);
+					break;
+				case 2:
+					b = swab16(x);
+					break;
+				}
+#endif /* __LITTLE_ENDIAN */
+				for (i = 0; i < load_store_size; i++) {
+					ret = put_user(b, ptr++);
+					if (ret)
+						break;
+					/* Success! shift 1 byte. */
+					b >>= 8;
+				}
+				/* Set register rx. */
+				regs->regs[rx] = ret;
+
+				/* Bump pc. */
+				regs->pc += 8;
+			}
+		}
+
+		unaligned_fixup_count++;
+
+		if (unaligned_printk) {
+			pr_info("%s/%d. Unalign fixup for kernel access "
+				"to userspace %lx.",
+				current->comm, current->pid, regs->regs[ra]);
+		}
+
+		/* Done! Return to the exception handler. */
+		return;
+	}
+
+	if ((align_ctl == 0) || unexpected) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = (unsigned char __user *)0
+		};
+		if (unaligned_printk)
+			pr_info("Unalign bundle: unexp @%llx, %llx",
+				(unsigned long long)regs->pc,
+				(unsigned long long)bundle);
+
+		if (ra < 56) {
+			unsigned long uaa = (unsigned long)regs->regs[ra];
+			/* Set bus Address. */
+			info.si_addr = (unsigned char __user *)uaa;
+		}
+
+		unaligned_fixup_count++;
+
+		trace_unhandled_signal("unaligned fixup trap", regs,
+				       (unsigned long)info.si_addr, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+#ifdef __LITTLE_ENDIAN
+#define UA_FIXUP_ADDR_DELTA          1
+#define UA_FIXUP_BFEXT_START(_B_)    0
+#define UA_FIXUP_BFEXT_END(_B_)     (8 * (_B_) - 1)
+#else /* __BIG_ENDIAN */
+#define UA_FIXUP_ADDR_DELTA          -1
+#define UA_FIXUP_BFEXT_START(_B_)   (64 - 8 * (_B_))
+#define UA_FIXUP_BFEXT_END(_B_)      63
+#endif /* __LITTLE_ENDIAN */
+
+
+
+	if ((ra != rb) && (rd != TREG_SP) && !alias &&
+	    !y1_br && !y1_lr && !x1_add) {
+		/*
+		 * Simple case: ra != rb and no register alias found,
+		 * and no branch or link. This will be the majority.
+		 * We can do a little better for the simple case than
+		 * the generic scheme below.
+		 */
+		if (!load_n_store) {
+			/*
+			 * Simple store: ra != rb, no need for scratch register.
+			 * Just store and rotate to right bytewise.
+			 */
+#ifdef __BIG_ENDIAN
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, load_store_size - 1) |
+				jit_x1_fnop();
+#endif /* __BIG_ENDIAN */
+			for (k = 0; k < load_store_size; k++) {
+				/* Store a byte. */
+				frag.insn[n++] =
+					jit_x0_rotli(rb, rb, 56) |
+					jit_x1_st1_add(ra, rb,
+						       UA_FIXUP_ADDR_DELTA);
+			}
+#ifdef __BIG_ENDIAN
+			frag.insn[n] = jit_x1_addi(ra, ra, 1);
+#else
+			frag.insn[n] = jit_x1_addi(ra, ra,
+						   -1 * load_store_size);
+#endif /* __LITTLE_ENDIAN */
+
+			if (load_store_size == 8) {
+				frag.insn[n] |= jit_x0_fnop();
+			} else if (load_store_size == 4) {
+				frag.insn[n] |= jit_x0_rotli(rb, rb, 32);
+			} else { /* = 2 */
+				frag.insn[n] |= jit_x0_rotli(rb, rb, 16);
+			}
+			n++;
+			if (bundle_2_enable)
+				frag.insn[n++] = bundle_2;
+			frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+		} else {
+			if (rd == ra) {
+				/* Use two clobber registers: clob1/2. */
+				frag.insn[n++] =
+					jit_x0_addi(TREG_SP, TREG_SP, -16) |
+					jit_x1_fnop();
+				frag.insn[n++] =
+					jit_x0_addi(clob1, ra, 7) |
+					jit_x1_st_add(TREG_SP, clob1, -8);
+				frag.insn[n++] =
+					jit_x0_addi(clob2, ra, 0) |
+					jit_x1_st(TREG_SP, clob2);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(rd, ra);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(clob1, clob1);
+				/*
+				 * Note: we must make sure that rd must not
+				 * be sp. Recover clob1/2 from stack.
+				 */
+				frag.insn[n++] =
+					jit_x0_dblalign(rd, clob1, clob2) |
+					jit_x1_ld_add(clob2, TREG_SP, 8);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ld_add(clob1, TREG_SP, 16);
+			} else {
+				/* Use one clobber register: clob1 only. */
+				frag.insn[n++] =
+					jit_x0_addi(TREG_SP, TREG_SP, -16) |
+					jit_x1_fnop();
+				frag.insn[n++] =
+					jit_x0_addi(clob1, ra, 7) |
+					jit_x1_st(TREG_SP, clob1);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(rd, ra);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(clob1, clob1);
+				/*
+				 * Note: we must make sure that rd must not
+				 * be sp. Recover clob1 from stack.
+				 */
+				frag.insn[n++] =
+					jit_x0_dblalign(rd, clob1, ra) |
+					jit_x1_ld_add(clob1, TREG_SP, 16);
+			}
+
+			if (bundle_2_enable)
+				frag.insn[n++] = bundle_2;
+			/*
+			 * For non 8-byte load, extract corresponding bytes and
+			 * signed extension.
+			 */
+			if (load_store_size == 4) {
+				if (load_store_signed)
+					frag.insn[n++] =
+						jit_x0_bfexts(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(4),
+							UA_FIXUP_BFEXT_END(4)) |
+						jit_x1_fnop();
+				else
+					frag.insn[n++] =
+						jit_x0_bfextu(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(4),
+							UA_FIXUP_BFEXT_END(4)) |
+						jit_x1_fnop();
+			} else if (load_store_size == 2) {
+				if (load_store_signed)
+					frag.insn[n++] =
+						jit_x0_bfexts(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(2),
+							UA_FIXUP_BFEXT_END(2)) |
+						jit_x1_fnop();
+				else
+					frag.insn[n++] =
+						jit_x0_bfextu(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(2),
+							UA_FIXUP_BFEXT_END(2)) |
+						jit_x1_fnop();
+			}
+
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_iret();
+		}
+	} else if (!load_n_store) {
+
+		/*
+		 * Generic memory store cases: use 3 clobber registers.
+		 *
+		 * Alloc space for saving clob2, 1, 3 on the user's stack.
+		 * Register clob3 points to where clob2 is saved, followed by
+		 * clob1 and clob3 from high to low memory.
+		 */
+		frag.insn[n++] =
+			jit_x0_addi(TREG_SP, TREG_SP, -32)    |
+			jit_x1_fnop();
+		frag.insn[n++] =
+			jit_x0_addi(clob3, TREG_SP, 16)  |
+			jit_x1_st_add(TREG_SP, clob3, 8);
+#ifdef __LITTLE_ENDIAN
+		frag.insn[n++] =
+			jit_x0_addi(clob1, ra, 0)   |
+			jit_x1_st_add(TREG_SP, clob1, 8);
+#else
+		frag.insn[n++] =
+			jit_x0_addi(clob1, ra, load_store_size - 1)   |
+			jit_x1_st_add(TREG_SP, clob1, 8);
+#endif
+		if (load_store_size == 8) {
+			/*
+			 * We save one byte a time, not for fast, but compact
+			 * code. After each store, data source register shift
+			 * right one byte. unchanged after 8 stores.
+			 */
+			frag.insn[n++] =
+				jit_x0_addi(clob2, TREG_ZERO, 7)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			frag.insn[n++] =
+				jit_x0_rotli(rb, rb, 56)      |
+				jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+			frag.insn[n++] =
+				jit_x0_addi(clob2, clob2, -1) |
+				jit_x1_bnezt(clob2, -1);
+			frag.insn[n++] =
+				jit_x0_fnop()                 |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		} else if (load_store_size == 4) {
+			frag.insn[n++] =
+				jit_x0_addi(clob2, TREG_ZERO, 3)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			frag.insn[n++] =
+				jit_x0_rotli(rb, rb, 56)      |
+				jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+			frag.insn[n++] =
+				jit_x0_addi(clob2, clob2, -1) |
+				jit_x1_bnezt(clob2, -1);
+			/*
+			 * same as 8-byte case, but need shift another 4
+			 * byte to recover rb for 4-byte store.
+			 */
+			frag.insn[n++] = jit_x0_rotli(rb, rb, 32)      |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		} else { /* =2 */
+			frag.insn[n++] =
+				jit_x0_addi(clob2, rb, 0)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			for (k = 0; k < 2; k++) {
+				frag.insn[n++] =
+					jit_x0_shrui(rb, rb, 8)  |
+					jit_x1_st1_add(clob1, rb,
+						       UA_FIXUP_ADDR_DELTA);
+			}
+			frag.insn[n++] =
+				jit_x0_addi(rb, clob2, 0)       |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		}
+
+		if (bundle_2_enable)
+			frag.insn[n++] = bundle_2;
+
+		if (y1_lr) {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_mfspr(y1_lr_reg,
+					     SPR_EX_CONTEXT_0_0);
+		}
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+					     clob2);
+		}
+		if (x1_add) {
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, x1_add_imm8) |
+				jit_x1_ld_add(clob2, clob3, -8);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_ld_add(clob2, clob3, -8);
+		}
+		frag.insn[n++] =
+			jit_x0_fnop()   |
+			jit_x1_ld_add(clob1, clob3, -8);
+		frag.insn[n++] = jit_x0_fnop()   | jit_x1_ld(clob3, clob3);
+		frag.insn[n++] = jit_x0_fnop()   | jit_x1_iret();
+
+	} else {
+		/*
+		 * Generic memory load cases.
+		 *
+		 * Alloc space for saving clob1, 2, 3 on the user's stack.
+		 * Register clob3 points to where clob1 is saved, followed
+		 * by clob2 and clob3 from high to low memory.
+		 */
+
+		frag.insn[n++] =
+			jit_x0_addi(TREG_SP, TREG_SP, -32) |
+			jit_x1_fnop();
+		frag.insn[n++] =
+			jit_x0_addi(clob3, TREG_SP, 16) |
+			jit_x1_st_add(TREG_SP, clob3, 8);
+		frag.insn[n++] =
+			jit_x0_addi(clob2, ra, 0) |
+			jit_x1_st_add(TREG_SP, clob2, 8);
+
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_addi(clob1, y1_br_reg, 0) |
+				jit_x1_st_add(TREG_SP, clob1, 16);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop() |
+				jit_x1_st_add(TREG_SP, clob1, 16);
+		}
+
+		if (bundle_2_enable)
+			frag.insn[n++] = bundle_2;
+
+		if (y1_lr) {
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_mfspr(y1_lr_reg,
+					     SPR_EX_CONTEXT_0_0);
+		}
+
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_fnop() |
+				jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+					     clob1);
+		}
+
+		frag.insn[n++] =
+			jit_x0_addi(clob1, clob2, 7)      |
+			jit_x1_ldna(rd, clob2);
+		frag.insn[n++] =
+			jit_x0_fnop()                     |
+			jit_x1_ldna(clob1, clob1);
+		frag.insn[n++] =
+			jit_x0_dblalign(rd, clob1, clob2) |
+			jit_x1_ld_add(clob1, clob3, -8);
+		if (x1_add) {
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, x1_add_imm8) |
+				jit_x1_ld_add(clob2, clob3, -8);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_ld_add(clob2, clob3, -8);
+		}
+
+		frag.insn[n++] =
+			jit_x0_fnop() |
+			jit_x1_ld(clob3, clob3);
+
+		if (load_store_size == 4) {
+			if (load_store_signed)
+				frag.insn[n++] =
+					jit_x0_bfexts(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(4),
+						UA_FIXUP_BFEXT_END(4)) |
+					jit_x1_fnop();
+			else
+				frag.insn[n++] =
+					jit_x0_bfextu(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(4),
+						UA_FIXUP_BFEXT_END(4)) |
+					jit_x1_fnop();
+		} else if (load_store_size == 2) {
+			if (load_store_signed)
+				frag.insn[n++] =
+					jit_x0_bfexts(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(2),
+						UA_FIXUP_BFEXT_END(2)) |
+					jit_x1_fnop();
+			else
+				frag.insn[n++] =
+					jit_x0_bfextu(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(2),
+						UA_FIXUP_BFEXT_END(2)) |
+					jit_x1_fnop();
+		}
+
+		frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+	}
+
+	/* Max JIT bundle count is 14. */
+	WARN_ON(n > 14);
+
+	if (!unexpected) {
+		int status = 0;
+		int idx = (regs->pc >> 3) &
+			((1ULL << (PAGE_SHIFT - UNALIGN_JIT_SHIFT)) - 1);
+
+		frag.pc = regs->pc;
+		frag.bundle = bundle;
+
+		if (unaligned_printk) {
+			pr_info("%s/%d, Unalign fixup: pc=%lx "
+				"bundle=%lx %d %d %d %d %d %d %d %d.",
+				current->comm, current->pid,
+				(unsigned long)frag.pc,
+				(unsigned long)frag.bundle,
+				(int)alias, (int)rd, (int)ra,
+				(int)rb, (int)bundle_2_enable,
+				(int)y1_lr, (int)y1_br, (int)x1_add);
+
+			for (k = 0; k < n; k += 2)
+				pr_info("[%d] %016llx %016llx", k,
+					(unsigned long long)frag.insn[k],
+					(unsigned long long)frag.insn[k+1]);
+		}
+
+		/* Swap bundle byte order for big endian sys. */
+#ifdef __BIG_ENDIAN
+		frag.bundle = GX_INSN_BSWAP(frag.bundle);
+		for (k = 0; k < n; k++)
+			frag.insn[k] = GX_INSN_BSWAP(frag.insn[k]);
+#endif /* __BIG_ENDIAN */
+
+		status = copy_to_user((void __user *)&jit_code_area[idx],
+				      &frag, sizeof(frag));
+		if (status) {
+			/* Fail to copy JIT into user land. send SIGSEGV. */
+			siginfo_t info = {
+				.si_signo = SIGSEGV,
+				.si_code = SEGV_MAPERR,
+				.si_addr = (void __user *)&jit_code_area[idx]
+			};
+
+			pr_warn("Unalign fixup: pid=%d %s jit_code_area=%llx",
+				current->pid, current->comm,
+				(unsigned long long)&jit_code_area[idx]);
+
+			trace_unhandled_signal("segfault in unalign fixup",
+					       regs,
+					       (unsigned long)info.si_addr,
+					       SIGSEGV);
+			force_sig_info(info.si_signo, &info, current);
+			return;
+		}
+
+
+		/* Do a cheaper increment, not accurate. */
+		unaligned_fixup_count++;
+		__flush_icache_range((unsigned long)&jit_code_area[idx],
+				     (unsigned long)&jit_code_area[idx] +
+				     sizeof(frag));
+
+		/* Setup SPR_EX_CONTEXT_0_0/1 for returning to user program.*/
+		__insn_mtspr(SPR_EX_CONTEXT_0_0, regs->pc + 8);
+		__insn_mtspr(SPR_EX_CONTEXT_0_1, PL_ICS_EX1(USER_PL, 0));
+
+		/* Modify pc at the start of new JIT. */
+		regs->pc = (unsigned long)&jit_code_area[idx].insn[0];
+		/* Set ICS in SPR_EX_CONTEXT_K_1. */
+		regs->ex1 = PL_ICS_EX1(USER_PL, 1);
+	}
+}
+
+
+/*
+ * C function to generate unaligned data JIT. Called from the unaligned
+ * data interrupt handler.
+ *
+ * First check if unaligned fixup is disabled, or the exception did not come
+ * from user space, or the sp register is unaligned; if so, generate a
+ * SIGBUS. Then map a page into user space as JIT area if it is not mapped
+ * yet. Generate JIT code by calling jit_bundle_gen(). After that return
+ * back to the exception handler.
+ *
+ * The exception handler will "iret" to the newly generated JIT code after
+ * restoring caller saved registers. In theory, the JIT code will perform
+ * another "iret" to resume the user's program.
+ */
+
+void do_unaligned(struct pt_regs *regs, int vecnum)
+{
+	tilegx_bundle_bits __user  *pc;
+	tilegx_bundle_bits bundle;
+	struct thread_info *info = current_thread_info();
+	int align_ctl;
+
+	/* Checks the per-process unaligned JIT flags */
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
+
+	/* Enable interrupts in order to access user land. */
+	local_irq_enable();
+
+	/*
+	 * The fault came from kernel space. Two choices:
+	 * (a) unaligned_fixup < 1, we will first call get/put_user fixup
+	 *     to return -EFAULT. If no fixup, simply panic the kernel.
+	 * (b) unaligned_fixup >=1, we will try to fix the unaligned access
+	 *     if it was triggered by get_user/put_user() macros. Panic the
+	 *     kernel if it is not fixable.
+	 */
+
+	if (EX1_PL(regs->ex1) != USER_PL) {
+
+		if (align_ctl < 1) {
+			unaligned_fixup_count++;
+			/* If exception came from kernel, try fix it up. */
+			if (fixup_exception(regs)) {
+				if (unaligned_printk)
+					pr_info("Unalign fixup: %d %llx @%llx",
+						(int)unaligned_fixup,
+						(unsigned long long)regs->ex1,
+						(unsigned long long)regs->pc);
+				return;
+			}
+			/* Not fixable. Go panic. */
+			panic("Unalign exception in Kernel. pc=%lx",
+			      regs->pc);
+			return;
+		} else {
+			/*
+			 * Try to fix the exception. If we can't, panic the
+			 * kernel.
+			 */
+			bundle = GX_INSN_BSWAP(
+				*((tilegx_bundle_bits *)(regs->pc)));
+			jit_bundle_gen(regs, bundle, align_ctl);
+			return;
+		}
+	}
+
+	/*
+	 * Fault came from user with ICS or stack is not aligned.
+	 * If so, we will trigger SIGBUS.
+	 */
+	if ((regs->sp & 0x7) || (regs->ex1) || (align_ctl < 0)) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = (unsigned char __user *)0
+		};
+
+		if (unaligned_printk)
+			pr_info("Unalign fixup: %d %llx @%llx",
+				(int)unaligned_fixup,
+				(unsigned long long)regs->ex1,
+				(unsigned long long)regs->pc);
+
+		unaligned_fixup_count++;
+
+		trace_unhandled_signal("unaligned fixup trap", regs, 0, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+
+	/* Read the bundle that caused the exception! */
+	pc = (tilegx_bundle_bits __user *)(regs->pc);
+	if (get_user(bundle, pc) != 0) {
+		/* Probably never be here since pc is valid user address.*/
+		siginfo_t info = {
+			.si_signo = SIGSEGV,
+			.si_code = SEGV_MAPERR,
+			.si_addr = (void __user *)pc
+		};
+		pr_err("Couldn't read instruction at %p trying to step\n", pc);
+		trace_unhandled_signal("segfault in unalign fixup", regs,
+				       (unsigned long)info.si_addr, SIGSEGV);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+	if (!info->unalign_jit_base) {
+		void __user *user_page;
+
+		/*
+		 * Allocate a page in userland.
+		 * For 64-bit processes we try to place the mapping far
+		 * from anything else that might be going on (specifically
+		 * 64 GB below the top of the user address space).  If it
+		 * happens not to be possible to put it there, it's OK;
+		 * the kernel will choose another location and we'll
+		 * remember it for later.
+		 */
+		if (is_compat_task())
+			user_page = NULL;
+		else
+			user_page = (void __user *)(TASK_SIZE - (1UL << 36)) +
+				((unsigned long)current->pid << PAGE_SHIFT);
+
+		user_page = (void __user *) vm_mmap(NULL,
+						    (unsigned long)user_page,
+						    PAGE_SIZE,
+						    PROT_EXEC | PROT_READ |
+						    PROT_WRITE,
+#ifdef CONFIG_HOMECACHE
+						    MAP_CACHE_HOME_TASK |
+#endif
+						    MAP_PRIVATE |
+						    MAP_ANONYMOUS,
+						    0);
+
+		if (IS_ERR((void __force *)user_page)) {
+			pr_err("Out of kernel pages trying do_mmap.\n");
+			return;
+		}
+
+		/* Save the address in the thread_info struct */
+		info->unalign_jit_base = user_page;
+		if (unaligned_printk)
+			pr_info("Unalign bundle: %d:%d, allocate page @%llx",
+				raw_smp_processor_id(), current->pid,
+				(unsigned long long)user_page);
+	}
+
+	/* Generate unalign JIT */
+	jit_bundle_gen(regs, GX_INSN_BSWAP(bundle), align_ctl);
+}
+
+#endif /* __tilegx__ */
diff --git a/arch/tile/kernel/vdso.c b/arch/tile/kernel/vdso.c
new file mode 100644
index 0000000..1533af2
--- /dev/null
+++ b/arch/tile/kernel/vdso.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#include <asm/vdso.h>
+#include <asm/mman.h>
+#include <asm/sections.h>
+
+#include <arch/sim.h>
+
+/* The alignment of the vDSO. */
+#define VDSO_ALIGNMENT  PAGE_SIZE
+
+
+static unsigned int vdso_pages;
+static struct page **vdso_pagelist;
+
+#ifdef CONFIG_COMPAT
+static unsigned int vdso32_pages;
+static struct page **vdso32_pagelist;
+#endif
+static int vdso_ready;
+
+/*
+ * The vdso data page.
+ */
+static union {
+	struct vdso_data	data;
+	u8			page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
+
+struct vdso_data *vdso_data = &vdso_data_store.data;
+
+static unsigned int __read_mostly vdso_enabled = 1;
+
+/* Build a NULL-terminated page array: the image pages, then the data page. */
+static struct page **vdso_setup(void *vdso_kbase, unsigned int pages)
+{
+	struct page **list = kzalloc(sizeof(*list) * (pages + 1), GFP_KERNEL);
+	int idx;
+
+	BUG_ON(!list);
+	for (idx = 0; idx < pages - 1; idx++) {
+		/* Image pages are taken page by page starting at vdso_kbase. */
+		list[idx] = virt_to_page(vdso_kbase + idx * PAGE_SIZE);
+		ClearPageReserved(list[idx]);
+	}
+	list[pages - 1] = virt_to_page(vdso_data);	/* vDSO data page */
+	list[pages] = NULL;
+
+	return list;
+}
+
+/* Boot-time setup of the vDSO page lists (or the sigreturn-only page). */
+static int __init vdso_init(void)
+{
+	int data_pages = sizeof(vdso_data_store) >> PAGE_SHIFT;
+
+	/*
+	 * With the vDSO disabled we still map one page holding the two-bundle
+	 * (16-byte) rt_sigreturn code, copied at its offset in the image.
+	 */
+	if (!vdso_enabled) {
+		size_t offset = (unsigned long)&__vdso_rt_sigreturn;
+		static struct page *sigret_page;
+		sigret_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		BUG_ON(sigret_page == NULL);
+		vdso_pagelist = &sigret_page;
+		vdso_pages = 1;
+		BUG_ON(offset + 16 > PAGE_SIZE);  /* whole copy must fit */
+		memcpy(page_address(sigret_page) + offset,
+		       vdso_start + offset, 16);
+#ifdef CONFIG_COMPAT
+		vdso32_pages = vdso_pages;	/* compat shares the page */
+		vdso32_pagelist = vdso_pagelist;
+#endif
+		vdso_ready = 1;
+		return 0;
+	}
+
+	vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
+	vdso_pages += data_pages;	/* vdso_setup() appends the data page */
+	vdso_pagelist = vdso_setup(vdso_start, vdso_pages);
+
+#ifdef CONFIG_COMPAT
+	vdso32_pages = (vdso32_end - vdso32_start) >> PAGE_SHIFT;
+	vdso32_pages += data_pages;
+	vdso32_pagelist = vdso_setup(vdso32_start, vdso32_pages);
+#endif
+
+	smp_wmb();	/* order the page list stores before vdso_ready */
+	vdso_ready = 1;
+	return 0;
+}
+arch_initcall(vdso_init);
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm != NULL && vma->vm_start == VDSO_BASE)
+		return "[vdso]";	/* starts at the vDSO base */
+#ifdef __tilegx__
+	return NULL;
+#else
+	return (vma->vm_start == MEM_USER_INTRPT) ? "[intrpt]" : NULL;
+#endif
+}
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return NULL;	/* always reports that there is no gate VMA */
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long address)
+{
+	return 0;	/* no address is ever reported as in a gate area */
+}
+
+int in_gate_area_no_mm(unsigned long address)
+{
+	return 0;	/* same answer when no mm is supplied */
+}
+
+/*
+ * Map the vDSO pages into the current process and record the base
+ * address in its mm context.  Returns 0 on success, or when there is
+ * nothing to map, and the failing call's error value otherwise.
+ * NOTE(review): the caller presumably holds mmap_sem for writing
+ * around this call -- confirm against the call site.
+ */
+int setup_vdso_pages(void)
+{
+	struct mm_struct *mm = current->mm;
+	struct page **list;
+	unsigned long npages, base, len;
+	int err;
+
+	/* Nothing to map until vdso_init() has set vdso_ready. */
+	if (!vdso_ready)
+		return 0;
+
+	mm->context.vdso_base = 0;
+
+	/* Default to the native image; compat tasks get the 32-bit one. */
+	list = vdso_pagelist;
+	npages = vdso_pages;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task()) {
+		list = vdso32_pagelist;
+		npages = vdso32_pages;
+	}
+#endif
+
+	/* A zero page count means the vDSO is unusable: leave it unmapped. */
+	if (!npages)
+		return 0;
+
+	/* Leave room for rounding the base up to the alignment. */
+	len = npages << PAGE_SHIFT;
+	base = get_unmapped_area(NULL, 0,
+				 len + ((VDSO_ALIGNMENT - 1) & PAGE_MASK),
+				 0, 0);
+	if (IS_ERR_VALUE(base))
+		return base;
+
+	base = ALIGN(base, VDSO_ALIGNMENT);
+
+	/*
+	 * Record the base in the mm before install_special_mapping() runs;
+	 * otherwise the perf counter mmap tracking code will fail to
+	 * recognise the mapping as a vDSO (since arch_vma_name fails).
+	 */
+	mm->context.vdso_base = base;
+
+	/*
+	 * The VMA flags lack VM_WRITE, so by default the process cannot
+	 * write these pages.  gdb can still do so through the ptrace
+	 * interface and thereby trigger COW, which is fine for planting
+	 * breakpoints in the vDSO code pages.  Doing it to the vDSO "data"
+	 * page, however, cuts the process off from kernel updates and
+	 * leaves its userland gettimeofday dead, so never do that.
+	 */
+	err = install_special_mapping(mm, base, len,
+				      VM_READ | VM_EXEC |
+				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
+				      list);
+	if (err)
+		mm->context.vdso_base = 0;	/* undo on failure */
+
+	return err;
+}
+
+static __init int vdso_func(char *s)
+{
+	return kstrtouint(s, 0, &vdso_enabled) == 0;	/* 1 = handled */
+}
+__setup("vdso=", vdso_func);
diff --git a/arch/tile/kernel/vdso/Makefile b/arch/tile/kernel/vdso/Makefile
new file mode 100644
index 0000000..e2b7a2f
--- /dev/null
+++ b/arch/tile/kernel/vdso/Makefile
@@ -0,0 +1,118 @@
+# Symbols present in the vdso
+vdso-syms = rt_sigreturn gettimeofday
+
+# Files to link into the vdso
+obj-vdso = $(patsubst %, v%.o, $(vdso-syms))
+
+# Build rules
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds
+obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
+
+# vdso32 is only for tilegx -m32 compat task.
+VDSO32-$(CONFIG_COMPAT) := y
+
+obj-y += vdso.o
+obj-$(VDSO32-y) += vdso32.o
+extra-y += vdso.lds
+CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+CFLAGS_REMOVE_vdso.o = -pg
+CFLAGS_REMOVE_vdso32.o = -pg
+CFLAGS_REMOVE_vrt_sigreturn.o = -pg
+CFLAGS_REMOVE_vrt_sigreturn32.o = -pg
+CFLAGS_REMOVE_vgettimeofday.o = -pg
+CFLAGS_REMOVE_vgettimeofday32.o = -pg
+
+ifdef CONFIG_FEEDBACK_COLLECT
+# Userspace vDSO code: also strip feedback collection (append; keep -pg above).
+CFLAGS_REMOVE_vdso.o += -ffeedback-generate
+CFLAGS_REMOVE_vdso32.o += -ffeedback-generate
+CFLAGS_REMOVE_vrt_sigreturn.o += -ffeedback-generate
+CFLAGS_REMOVE_vrt_sigreturn32.o += -ffeedback-generate
+CFLAGS_REMOVE_vgettimeofday.o += -ffeedback-generate
+CFLAGS_REMOVE_vgettimeofday32.o += -ffeedback-generate
+endif
+
+# Disable gcov profiling for VDSO code
+GCOV_PROFILE := n
+
+# Force dependency
+$(obj)/vdso.o: $(obj)/vdso.so
+
+# link rule for the .so file, .lds has to be first
+SYSCFLAGS_vdso.so.dbg = $(c_flags)
+$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso)
+	$(call if_changed,vdsold)
+
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO.  With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vdso-syms.o
+$(obj)/built-in.o: $(obj)/vdso-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
+
+SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \
+                            $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+SYSCFLAGS_vdso_syms.o = -r
+$(obj)/vdso-syms.o: $(src)/vdso.lds $(obj)/vrt_sigreturn.o FORCE
+	$(call if_changed,vdsold)
+
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+# actual build commands
+# The DSO images are built using a special linker script
+# Add -lgcc so tilepro gets static muldi3 and lshrdi3 definitions.
+# Make sure only to export the intended __vdso_xxx symbol offsets.
+quiet_cmd_vdsold = VDSOLD  $@
+      cmd_vdsold = $(CC) $(KCFLAGS) -nostdlib $(SYSCFLAGS_$(@F)) \
+                           -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp -lgcc && \
+                   $(CROSS_COMPILE)objcopy \
+                           $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso.so: $(obj)/vdso.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso32.so: $(obj)/vdso32.so.dbg
+	$(call cmd,vdso_install)
+
+vdso_install: vdso.so
+vdso32_install: vdso32.so
+
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m32 -s
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 += -m32 -fPIC -shared
+
+obj-vdso32 = $(patsubst %, v%32.o, $(vdso-syms))
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+targets += $(obj-vdso32) vdso32.so vdso32.so.dbg
+
+$(obj-vdso32:%=%): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj-vdso32:%=%): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vgettimeofday32.o: $(obj)/vgettimeofday.c
+	$(call if_changed,cc_o_c)
+
+$(obj)/vrt_sigreturn32.o: $(obj)/vrt_sigreturn.S
+	$(call if_changed,as_o_S)
+
+# Force dependency
+$(obj)/vdso32.o: $(obj)/vdso32.so
+
+SYSCFLAGS_vdso32.so.dbg = -m32 -shared -s -Wl,-soname=linux-vdso32.so.1 \
+			    $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+$(obj)/vdso32.so.dbg: $(src)/vdso.lds $(obj-vdso32)
+	$(call if_changed,vdsold)
diff --git a/arch/tile/kernel/vdso/vdso.S b/arch/tile/kernel/vdso/vdso.S
new file mode 100644
index 0000000..3467adb
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.global vdso_start, vdso_end
+	.align PAGE_SIZE
+vdso_start:
+	.incbin "arch/tile/kernel/vdso/vdso.so"
+	.align PAGE_SIZE
+vdso_end:
+
+	.previous
diff --git a/arch/tile/kernel/vdso/vdso.lds.S b/arch/tile/kernel/vdso/vdso.lds.S
new file mode 100644
index 0000000..041cd6c
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso.lds.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#define VDSO_VERSION_STRING	LINUX_2.6
+
+
+OUTPUT_ARCH(tile)
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__vdso_rt_sigreturn);
+
+
+SECTIONS
+{
+	. = SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+
+	.rodata	 : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+
+	/*
+	 * This linker script is used both with -r and with -shared.
+	 * For the layouts to match, we need to skip more than enough
+	 * space for the dynamic symbol table et al. If this amount
+	 * is insufficient, ld -shared will barf. Just increase it here.
+	 */
+	. = 0x1000;
+	.text		: { *(.text .text.*) }		:text
+
+	.data		: {
+		*(.got.plt) *(.got)
+		*(.data .data.* .gnu.linkonce.d.*)
+		*(.dynbss)
+		*(.bss .bss.* .gnu.linkonce.b.*)
+	}
+}
+
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD		FLAGS(5) FILEHDR PHDRS;	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC	FLAGS(4);		/* PF_R */
+	note		PT_NOTE		FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		__vdso_rt_sigreturn;
+		__vdso_gettimeofday;
+		gettimeofday;
+	local:*;
+	};
+}
diff --git a/arch/tile/kernel/vdso/vdso32.S b/arch/tile/kernel/vdso/vdso32.S
new file mode 100644
index 0000000..1d1ac32
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso32.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.global vdso32_start, vdso32_end
+	.align PAGE_SIZE
+vdso32_start:
+	.incbin "arch/tile/kernel/vdso/vdso32.so"
+	.align PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/tile/kernel/vdso/vgettimeofday.c b/arch/tile/kernel/vdso/vgettimeofday.c
new file mode 100644
index 0000000..51ec8e4
--- /dev/null
+++ b/arch/tile/kernel/vdso/vgettimeofday.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#define VDSO_BUILD  /* avoid some shift warnings for -m32 in <asm/page.h> */
+#include <linux/time.h>
+#include <asm/timex.h>
+#include <asm/vdso.h>
+
+#if CHIP_HAS_SPLIT_CYCLE()
+static inline cycles_t get_cycles_inline(void)
+{
+	unsigned int hi_before, lo, hi_after;
+
+	/* Re-read until the high word is the same on both sides of the low. */
+	hi_after = __insn_mfspr(SPR_CYCLE_HIGH);
+	do {
+		hi_before = hi_after;
+		lo = __insn_mfspr(SPR_CYCLE_LOW);
+		hi_after = __insn_mfspr(SPR_CYCLE_HIGH);
+	} while (unlikely(hi_before != hi_after));
+
+	return ((cycles_t)hi_before << 32) | lo;
+}
+#define get_cycles get_cycles_inline
+#endif
+
+/*
+ * Find out the vDSO data page address in the process address space.
+ * The result is the page after the one holding the address produced by
+ * "lnk", which assumes this code sits in the last page of the vDSO image
+ * (the kernel maps the data page right after the image pages).
+ */
+static inline unsigned long get_datapage(void)
+{
+	unsigned long here;
+
+	/* NOTE(review): "lnk" presumably yields a nearby code address. */
+	asm volatile ("lnk %0" : "=r"(here));
+	return (here & ~(PAGE_SIZE - 1)) + PAGE_SIZE;
+}
+
+/* vDSO gettimeofday(): fills *tv and *tz (either may be NULL); returns 0. */
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	cycles_t cycles;
+	unsigned long count, sec, ns;
+	volatile struct vdso_data *vdso_data;
+
+	vdso_data = (struct vdso_data *)get_datapage();
+	/* The use of the timezone is obsolete, normally tz is NULL. */
+	if (unlikely(tz != NULL)) {
+		while (1) {
+			/* Spin while an update is in progress (odd count). */
+			count = vdso_data->tz_update_count;
+			if (count & 1)
+				continue;
+
+			tz->tz_minuteswest = vdso_data->tz_minuteswest;
+			tz->tz_dsttime = vdso_data->tz_dsttime;
+
+			/* Retry if the count changed while we were reading. */
+			if (count == vdso_data->tz_update_count)
+				break;
+		}
+	}
+
+	if (unlikely(tv == NULL))
+		return 0;
+
+	while (1) {
+		/* Spin while an update is in progress (odd count). */
+		count = vdso_data->tb_update_count;
+		if (count & 1)
+			continue;
+
+		cycles = (get_cycles() - vdso_data->xtime_tod_stamp);
+		ns = (cycles * vdso_data->mult) >> vdso_data->shift;
+		sec = vdso_data->xtime_clock_sec;
+		ns += vdso_data->xtime_clock_nsec;
+		while (ns >= NSEC_PER_SEC) {	/* may carry more than 1 s */
+			ns -= NSEC_PER_SEC;
+			sec += 1;
+		}
+
+		/* Retry if the count changed while we were reading. */
+		if (count == vdso_data->tb_update_count)
+			break;
+	}
+
+	tv->tv_sec = sec;
+	tv->tv_usec = ns / 1000;
+	return 0;
+}
+
+int gettimeofday(struct timeval *tv, struct timezone *tz)
+	__attribute__((weak, alias("__vdso_gettimeofday")));
diff --git a/arch/tile/kernel/vdso/vrt_sigreturn.S b/arch/tile/kernel/vdso/vrt_sigreturn.S
new file mode 100644
index 0000000..6326caf
--- /dev/null
+++ b/arch/tile/kernel/vdso/vrt_sigreturn.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/linkage.h>
+#include <arch/abi.h>
+#include <asm/unistd.h>
+
+/*
+ * Note that libc has a copy of this function that it uses to compare
+ * against the PC when a stack backtrace ends, so if this code is
+ * changed, the libc implementation(s) should also be updated.
+ */
+ENTRY(__vdso_rt_sigreturn)
+	moveli TREG_SYSCALL_NR_NAME, __NR_rt_sigreturn
+	swint1
+	/* We don't use ENDPROC to avoid tagging this symbol as FUNC,
+	 * which confuses the perf tool.
+	 */
+	END(__vdso_rt_sigreturn)
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index a13ed90..f1819423 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
 #include <hv/hypervisor.h>
 
 /* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
 
 OUTPUT_ARCH(tile)
 ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
 
 PHDRS
 {
-  intrpt1 PT_LOAD ;
+  intrpt PT_LOAD ;
   text PT_LOAD ;
   data PT_LOAD ;
 }
@@ -24,14 +24,17 @@ SECTIONS
   #define LOAD_OFFSET TEXT_OFFSET
 
   /* Interrupt vectors */
-  .intrpt1 (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
+  .intrpt (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
   {
     _text = .;
-    *(.intrpt1)
-  } :intrpt1 =0
+    *(.intrpt)
+  } :intrpt =0
 
   /* Hypervisor call vectors */
-  #include "hvglue.lds"
+  . = ALIGN(0x10000);
+  .hvglue : AT (ADDR(.hvglue) - LOAD_OFFSET) {
+    *(.hvglue)
+  } :NONE
 
   /* Now the real code */
   . = ALIGN(0x20000);
@@ -40,7 +43,11 @@ SECTIONS
     HEAD_TEXT
     SCHED_TEXT
     LOCK_TEXT
+    KPROBES_TEXT
+    IRQENTRY_TEXT
     __fix_text_end = .;   /* tile-cpack won't rearrange before this */
+    ALIGN_FUNCTION();
+    *(.hottext*)
     TEXT_TEXT
     *(.text.*)
     *(.coldtext*)
@@ -67,20 +74,8 @@ SECTIONS
   __init_end = .;
 
   _sdata = .;                   /* Start of data section */
-
   RO_DATA_SECTION(PAGE_SIZE)
-
-  /* initially writeable, then read-only */
-  . = ALIGN(PAGE_SIZE);
-  __w1data_begin = .;
-  .w1data : AT(ADDR(.w1data) - LOAD_OFFSET) {
-    VMLINUX_SYMBOL(__w1data_begin) = .;
-    *(.w1data)
-    VMLINUX_SYMBOL(__w1data_end) = .;
-  }
-
   RW_DATA_SECTION(L2_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
-
   _edata = .;
 
   EXCEPTION_TABLE(L2_CACHE_BYTES)
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 985f598..c4211cb 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -4,15 +4,15 @@
 
 lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
 	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
-	strchr_$(BITS).o strlen_$(BITS).o
-
-ifeq ($(CONFIG_TILEGX),y)
-CFLAGS_REMOVE_memcpy_user_64.o = -fno-omit-frame-pointer
-lib-y += memcpy_user_64.o
-else
-lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
-endif
+	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
 
+lib-$(CONFIG_TILEGX) += memcpy_user_64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
 lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
 
 obj-$(CONFIG_MODULES) += exports.o
+
+# The finv_buffer_remote() and copy_{to,from}_user() routines can't
+# have -pg added, since they both rely on being leaf functions.
+CFLAGS_REMOVE_cacheflush.o = -pg
+CFLAGS_REMOVE_memcpy_user_64.o = -pg
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index f5cada7..759efa3 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -20,50 +20,12 @@
 #include <linux/atomic.h>
 #include <arch/chip.h>
 
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
-	int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
-	__write_once = {
-	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* This page is remapped on startup to be hash-for-home. */
 int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 int *__atomic_hashed_lock(volatile void *v)
 {
 	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	unsigned long i =
-		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
-	unsigned long n = __insn_crc32_32(0, i);
-
-	/* Grab high bits for L1 index. */
-	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
-	/* Grab low bits for L2 index. */
-	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
-	return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
 	/*
 	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
 	 * Using mm works here because atomic_locks is page aligned.
@@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v)
 				      (unsigned long)atomic_locks,
 				      2, (ATOMIC_HASH_SHIFT + 2) - 1);
 	return (int *)ptr;
-#endif
 }
 
 #ifdef CONFIG_SMP
 /* Return whether the passed pointer is a valid atomic lock pointer. */
 static int is_atomic_lock(int *p)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	int i;
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
-		if (p >= &atomic_lock_ptr[i]->lock[0] &&
-		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
-			return 1;
-		}
-	}
-	return 0;
-#else
 	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
 }
 
 void __atomic_fault_unlock(int *irqlock_word)
@@ -110,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)
 	return __atomic_hashed_lock(v);
 }
 
-int _atomic_xchg(atomic_t *v, int n)
+int _atomic_xchg(int *v, int n)
 {
-	return __atomic_xchg(&v->counter, __atomic_setup(v), n).val;
+	return __atomic_xchg(v, __atomic_setup(v), n).val;
 }
 EXPORT_SYMBOL(_atomic_xchg);
 
-int _atomic_xchg_add(atomic_t *v, int i)
+int _atomic_xchg_add(int *v, int i)
 {
-	return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val;
+	return __atomic_xchg_add(v, __atomic_setup(v), i).val;
 }
 EXPORT_SYMBOL(_atomic_xchg_add);
 
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u)
+int _atomic_xchg_add_unless(int *v, int a, int u)
 {
 	/*
 	 * Note: argument order is switched here since it is easier
 	 * to use the first argument consistently as the "old value"
 	 * in the assembly, as is done for _atomic_cmpxchg().
 	 */
-	return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a)
-		.val;
+	return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;
 }
 EXPORT_SYMBOL(_atomic_xchg_add_unless);
 
-int _atomic_cmpxchg(atomic_t *v, int o, int n)
+int _atomic_cmpxchg(int *v, int o, int n)
 {
-	return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val;
+	return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;
 }
 EXPORT_SYMBOL(_atomic_cmpxchg);
 
@@ -159,33 +107,32 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
 EXPORT_SYMBOL(_atomic_xor);
 
 
-u64 _atomic64_xchg(atomic64_t *v, u64 n)
+u64 _atomic64_xchg(u64 *v, u64 n)
 {
-	return __atomic64_xchg(&v->counter, __atomic_setup(v), n);
+	return __atomic64_xchg(v, __atomic_setup(v), n);
 }
 EXPORT_SYMBOL(_atomic64_xchg);
 
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i)
+u64 _atomic64_xchg_add(u64 *v, u64 i)
 {
-	return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i);
+	return __atomic64_xchg_add(v, __atomic_setup(v), i);
 }
 EXPORT_SYMBOL(_atomic64_xchg_add);
 
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u)
+u64 _atomic64_xchg_add_unless(u64 *v, u64 a, u64 u)
 {
 	/*
 	 * Note: argument order is switched here since it is easier
 	 * to use the first argument consistently as the "old value"
 	 * in the assembly, as is done for _atomic_cmpxchg().
 	 */
-	return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v),
-					  u, a);
+	return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);
 }
 EXPORT_SYMBOL(_atomic64_xchg_add_unless);
 
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+u64 _atomic64_cmpxchg(u64 *v, u64 o, u64 n)
 {
-	return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n);
+	return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);
 }
 EXPORT_SYMBOL(_atomic64_cmpxchg);
 
@@ -208,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)
 }
 
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-static int __init noatomichash(char *str)
-{
-	pr_warning("noatomichash is deprecated.\n");
-	return 1;
-}
-__setup("noatomichash", noatomichash);
-#endif
-
 void __init __init_atomic_per_cpu(void)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-	unsigned int i;
-	int actual_cpu;
-
-	/*
-	 * Before this is called from setup, we just have one lock for
-	 * all atomic objects/operations.  Here we replace the
-	 * elements of atomic_lock_ptr so that they point at per_cpu
-	 * integers.  This seemingly over-complex approach stems from
-	 * the fact that DEFINE_PER_CPU defines an entry for each cpu
-	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1.  But
-	 * for efficient hashing of atomics to their locks we want a
-	 * compile time constant power of 2 for the size of this
-	 * table, so we use ATOMIC_HASH_SIZE.
-	 *
-	 * Here we populate atomic_lock_ptr from the per cpu
-	 * atomic_lock_pool, interspersing by actual cpu so that
-	 * subsequent elements are homed on consecutive cpus.
-	 */
-
-	actual_cpu = cpumask_first(cpu_possible_mask);
-
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-		/*
-		 * Preincrement to slightly bias against using cpu 0,
-		 * which has plenty of stuff homed on it already.
-		 */
-		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
-		if (actual_cpu >= nr_cpu_ids)
-			actual_cpu = cpumask_first(cpu_possible_mask);
-
-		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 	/* Validate power-of-two and "bigger than cpus" assumption */
 	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -279,6 +180,4 @@ void __init __init_atomic_per_cpu(void)
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
 	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 }
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 3063804..6bda313 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)
 	STD_ENDPROC(__atomic\name)
 	.ifc \bitwidth,32
 	.pushsection __ex_table,"a"
+	.align  4
 	.word   1b, __atomic\name
 	.word   2b, __atomic\name
 	.word   __atomic\name, __atomic_bad_address
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 8f8ad81..9c0ec22 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -36,7 +36,8 @@ static inline void force_load(char *p)
  * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
  * until the memory controller holds the flushed values.
  */
-void finv_buffer_remote(void *buffer, size_t size, int hfh)
+void __attribute__((optimize("omit-frame-pointer")))
+finv_buffer_remote(void *buffer, size_t size, int hfh)
 {
 	char *p, *base;
 	size_t step_size, load_count;
@@ -147,18 +148,21 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 		force_load(p);
 
 	/*
-	 * Repeat, but with inv's instead of loads, to get rid of the
+	 * Repeat, but with finv's instead of loads, to get rid of the
 	 * data we just loaded into our own cache and the old home L3.
-	 * No need to unroll since inv's don't target a register.
+	 * No need to unroll since finv's don't target a register.
+	 * The finv's are guaranteed not to actually flush the data in
+	 * the buffer back to their home, since we just read it, so the
+	 * lines are clean in cache; we will only invalidate those lines.
 	 */
 	p = (char *)buffer + size - 1;
-	__insn_inv(p);
+	__insn_finv(p);
 	p -= step_size;
 	p = (char *)((unsigned long)p | (step_size - 1));
 	for (; p >= base; p -= step_size)
-		__insn_inv(p);
+		__insn_finv(p);
 
-	/* Wait for the load+inv's (and thus finvs) to have completed. */
+	/* Wait for these finv's (and thus the first finvs) to be done. */
 	__insn_mf();
 
 #ifdef __tilegx__
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index a93b02a..82733c8 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -22,7 +22,6 @@ EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
 EXPORT_SYMBOL(flush_user_asm);
-EXPORT_SYMBOL(inv_user_asm);
 EXPORT_SYMBOL(finv_user_asm);
 
 /* arch/tile/kernel/entry.S */
@@ -34,6 +33,12 @@ EXPORT_SYMBOL(dump_stack);
 /* arch/tile/kernel/head.S */
 EXPORT_SYMBOL(empty_zero_page);
 
+#ifdef CONFIG_FUNCTION_TRACER
+/* arch/tile/kernel/mcount_64.S */
+#include <asm/ftrace.h>
+EXPORT_SYMBOL(__mcount);
+#endif /* CONFIG_FUNCTION_TRACER */
+
 /* arch/tile/lib/, various memcpy files */
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__copy_to_user_inatomic);
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
index 6f867db..f8196b3 100644
--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@@ -36,7 +36,7 @@ void *memchr(const void *s, int c, size_t n)
 	p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	goal = 0x0101010101010101ULL * (uint8_t) c;
+	goal = copy_byte(c);
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 2a419a6..a2771ae 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
 
 #include <linux/linkage.h>
 
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
 #define IS_MEMCPY	  0
 #define IS_COPY_FROM_USER  1
 #define IS_COPY_FROM_USER_ZEROING  2
@@ -44,6 +36,7 @@
  */
 #define EX \
 	.pushsection __ex_table, "a"; \
+	.align 4; \
 	.word 9f, memcpy_common_fixup; \
 	.popsection; \
 	9
@@ -158,12 +151,9 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
-#if CHIP_HAS_WH64()
 	/* No need to prefetch dst, we'll just do the wh64
 	 * right before we copy a line.
 	 */
-#endif
-
 EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, .; move r27, lr }
@@ -171,21 +161,6 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
-	/* Prefetch the dest */
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	/* Use a real load to cause a TLB miss if necessary.  We aren't using
-	 * r28, so this should be fine.
-	 */
-EX:	{ lw r28, r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bz zero, .Lbig_loop2 }
 
@@ -286,13 +261,8 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }
 	/* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
-#if CHIP_HAS_WH64()
 	/* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
-#else
-	/* Prefetch dest line */
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
 
 	/* Load the three remaining words from the last L1D line, which
@@ -330,16 +300,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
-	/* Back up the r9 to a cache line we are already storing to
-	 * if it gets past the end of the dest vector.  Strictly speaking,
-	 * we don't need to back up to the start of a cache line, but it's free
-	 * and tidy, so why not?
-	 */
-EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
 	/* Store second L1D line. */
 EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
 EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
@@ -403,7 +364,6 @@ EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
 
 .Ldest_is_word_aligned:
 
-#if CHIP_HAS_DWORD_ALIGN()
 EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
 	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
 
@@ -511,26 +471,6 @@ EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
 	/* Move r1 back to the point where it corresponds to r0. */
 	{ addi r1, r1, -4 }
 
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
-	/* Compute right/left shift counts and load initial source words. */
-	{ andi r5, r1, -4; andi r3, r1, 3 }
-EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
-	/* Load and store one word at a time, using shifts and ORs
-	 * to correct for the misaligned src.
-	 */
-.Lcopy_unaligned_src_loop:
-	{ shr r6, r6, r3; shl r8, r7, r4 }
-EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
-	{ addi r5, r5, 4; slti_u r8, r2, 8 }
-	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
-	{ bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
 	/* Fall through */
 
 /*
@@ -614,5 +554,6 @@ memcpy_fixup_loop:
 	.size memcpy_common_fixup, . - memcpy_common_fixup
 
 	.section __ex_table,"a"
+	.align 4
 	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
 	.word .Lctu, .Lcopy_to_user_fixup_done
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
index c79b8e7..4815354 100644
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@@ -18,14 +18,17 @@
 /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
 
 /* Must be 8 bytes in size. */
-#define word_t uint64_t
+#define op_t uint64_t
 
-#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
-#error "Assumes 64 or 128 byte line size"
+/* Threshold value for when to enter the unrolled loops. */
+#define	OP_T_THRES	16
+
+#if CHIP_L2_LINE_SIZE() != 64
+#error "Assumes 64 byte line size"
 #endif
 
 /* How many cache lines ahead should we prefetch? */
-#define PREFETCH_LINES_AHEAD 3
+#define PREFETCH_LINES_AHEAD 4
 
 /*
  * Provide "base versions" of load and store for the normal code path.
@@ -51,15 +54,16 @@ void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
  * macros to return a count of uncopied bytes due to mm fault.
  */
 #define RETVAL 0
-int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+int __attribute__((optimize("omit-frame-pointer")))
+USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 #endif
 {
 	char *__restrict dst1 = (char *)dstv;
 	const char *__restrict src1 = (const char *)srcv;
 	const char *__restrict src1_end;
 	const char *__restrict prefetch;
-	word_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
-	word_t final; /* Final bytes to write to trailing word, if any */
+	op_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
+	op_t final; /* Final bytes to write to trailing word, if any */
 	long i;
 
 	if (n < 16) {
@@ -79,104 +83,228 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
 		__insn_prefetch(prefetch);
 		prefetch += CHIP_L2_LINE_SIZE();
-		prefetch = (prefetch > src1_end) ? prefetch : src1;
+		prefetch = (prefetch < src1_end) ? prefetch : src1;
 	}
 
 	/* Copy bytes until dst is word-aligned. */
-	for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+	for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
 		ST1(dst1++, LD1(src1++));
 
 	/* 8-byte pointer to destination memory. */
-	dst8 = (word_t *)dst1;
-
-	if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
-		/*
-		 * Misaligned copy.  Copy 8 bytes at a time, but don't
-		 * bother with other fanciness.
-		 *
-		 * TODO: Consider prefetching and using wh64 as well.
-		 */
-
-		/* Create an aligned src8. */
-		const word_t *__restrict src8 =
-			(const word_t *)((uintptr_t)src1 & -sizeof(word_t));
-		word_t b;
-
-		word_t a = LD8(src8++);
-		for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
-			b = LD8(src8++);
-			a = __insn_dblalign(a, b, src1);
-			ST8(dst8++, a);
-			a = b;
+	dst8 = (op_t *)dst1;
+
+	if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
+		/* Unaligned copy. */
+
+		op_t  tmp0 = 0, tmp1 = 0, tmp2, tmp3;
+		const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
+						   -sizeof(op_t));
+		const void *srci = (void *)src1;
+		int m;
+
+		m = (CHIP_L2_LINE_SIZE() << 2) -
+			(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
+		m = (n < m) ? n : m;
+		m /= sizeof(op_t);
+
+		/* Copy until 'dst' is cache-line-aligned. */
+		n -= (sizeof(op_t) * m);
+
+		switch (m % 4) {
+		case 0:
+			if (__builtin_expect(!m, 0))
+				goto _M0;
+			tmp1 = LD8(src8++);
+			tmp2 = LD8(src8++);
+			goto _8B3;
+		case 2:
+			m += 2;
+			tmp3 = LD8(src8++);
+			tmp0 = LD8(src8++);
+			goto _8B1;
+		case 3:
+			m += 1;
+			tmp2 = LD8(src8++);
+			tmp3 = LD8(src8++);
+			goto _8B2;
+		case 1:
+			m--;
+			tmp0 = LD8(src8++);
+			tmp1 = LD8(src8++);
+			if (__builtin_expect(!m, 0))
+				goto _8B0;
+		}
+
+		do {
+			tmp2 = LD8(src8++);
+			tmp0 =  __insn_dblalign(tmp0, tmp1, srci);
+			ST8(dst8++, tmp0);
+_8B3:
+			tmp3 = LD8(src8++);
+			tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+			ST8(dst8++, tmp1);
+_8B2:
+			tmp0 = LD8(src8++);
+			tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+			ST8(dst8++, tmp2);
+_8B1:
+			tmp1 = LD8(src8++);
+			tmp3 = __insn_dblalign(tmp3, tmp0, srci);
+			ST8(dst8++, tmp3);
+			m -= 4;
+		} while (m);
+
+_8B0:
+		tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+		ST8(dst8++, tmp0);
+		src8--;
+
+_M0:
+		if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
+			op_t tmp4, tmp5, tmp6, tmp7, tmp8;
+
+			prefetch = ((const char *)src8) +
+				CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
+
+			for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
+			     n -= CHIP_L2_LINE_SIZE()) {
+				/* Prefetch and advance to next line to
+				   prefetch, but don't go past the end.  */
+				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier.  */
+				__asm__ ("" : : : "memory");
+
+				prefetch += CHIP_L2_LINE_SIZE();
+				prefetch = (prefetch < src1_end) ? prefetch :
+					(const char *) src8;
+
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+				tmp8 = LD8(src8++);
+
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+				tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+				tmp3 = __insn_dblalign(tmp3, tmp4, srci);
+				tmp4 = __insn_dblalign(tmp4, tmp5, srci);
+				tmp5 = __insn_dblalign(tmp5, tmp6, srci);
+				tmp6 = __insn_dblalign(tmp6, tmp7, srci);
+				tmp7 = __insn_dblalign(tmp7, tmp8, srci);
+
+				__insn_wh64(dst8);
+
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				tmp0 = tmp8;
+			}
+			src8--;
+		}
+
+		/* Copy the remaining 8-byte chunks. */
+		if (n >= sizeof(op_t)) {
+			tmp0 = LD8(src8++);
+			for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
+				tmp1 = LD8(src8++);
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				ST8(dst8++, tmp0);
+				tmp0 = tmp1;
+			}
+			src8--;
 		}
 
 		if (n == 0)
 			return RETVAL;
 
-		b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+		tmp0 = LD8(src8++);
+		tmp1 = ((const char *)src8 <= src1_end)
+			? LD8((op_t *)src8) : 0;
+		final = __insn_dblalign(tmp0, tmp1, srci);
 
-		/*
-		 * Final source bytes to write to trailing partial
-		 * word, if any.
-		 */
-		final = __insn_dblalign(a, b, src1);
 	} else {
 		/* Aligned copy. */
 
-		const word_t* __restrict src8 = (const word_t *)src1;
+		const op_t *__restrict src8 = (const op_t *)src1;
 
 		/* src8 and dst8 are both word-aligned. */
 		if (n >= CHIP_L2_LINE_SIZE()) {
 			/* Copy until 'dst' is cache-line-aligned. */
 			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
-			     n -= sizeof(word_t))
+			     n -= sizeof(op_t))
 				ST8(dst8++, LD8(src8++));
 
 			for (; n >= CHIP_L2_LINE_SIZE(); ) {
-				__insn_wh64(dst8);
+				op_t tmp0, tmp1, tmp2, tmp3;
+				op_t tmp4, tmp5, tmp6, tmp7;
 
 				/*
 				 * Prefetch and advance to next line
-				 * to prefetch, but don't go past the end
+				 * to prefetch, but don't go past the
+				 * end.
 				 */
 				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier.  */
+				__asm__ ("" : : : "memory");
+
 				prefetch += CHIP_L2_LINE_SIZE();
-				prefetch = (prefetch > src1_end) ? prefetch :
+				prefetch = (prefetch < src1_end) ? prefetch :
 					(const char *)src8;
 
 				/*
-				 * Copy an entire cache line.  Manually
-				 * unrolled to avoid idiosyncracies of
-				 * compiler unrolling.
+				 * Do all the loads before wh64.  This
+				 * is necessary if [src8, src8+7] and
+				 * [dst8, dst8+7] share the same cache
+				 * line and dst8 <= src8, as can be
+				 * the case when called from memmove,
+				 * or with code tested on x86 whose
+				 * memcpy always works with forward
+				 * copies.
 				 */
-#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
-				COPY_WORD(0);
-				COPY_WORD(1);
-				COPY_WORD(2);
-				COPY_WORD(3);
-				COPY_WORD(4);
-				COPY_WORD(5);
-				COPY_WORD(6);
-				COPY_WORD(7);
-#if CHIP_L2_LINE_SIZE() == 128
-				COPY_WORD(8);
-				COPY_WORD(9);
-				COPY_WORD(10);
-				COPY_WORD(11);
-				COPY_WORD(12);
-				COPY_WORD(13);
-				COPY_WORD(14);
-				COPY_WORD(15);
-#elif CHIP_L2_LINE_SIZE() != 64
-# error Fix code that assumes particular L2 cache line sizes
-#endif
+				tmp0 = LD8(src8++);
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+
+				/* wh64 and wait for tmp7 load completion. */
+				__asm__ ("move %0, %0; wh64 %1\n"
+					 : : "r"(tmp7), "r"(dst8));
 
-				dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
-				src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				n -= CHIP_L2_LINE_SIZE();
 			}
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
 		}
 
-		for (; n >= sizeof(word_t); n -= sizeof(word_t))
+		for (; n >= sizeof(op_t); n -= sizeof(op_t))
 			ST8(dst8++, LD8(src8++));
 
 		if (__builtin_expect(n == 0, 1))
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index 3bc4b4e..0000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
-	void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
-	void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
-	void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
-  __insn_mtspr(SPR_SIM_CONTROL, \
-   SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache.  (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
-			      pte_t dst_pte, pte_t src_pte, int len)
-{
-	int idx;
-	unsigned long flags, newsrc, newdst;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	int type0, type1;
-	int cpu = get_cpu();
-
-	/*
-	 * Disable interrupts so that we don't recurse into memcpy()
-	 * in an interrupt handler, nor accidentally reference
-	 * the PA of the source from an interrupt routine.  Also
-	 * notify the simulator that we're playing games so we don't
-	 * generate spurious coherency warnings.
-	 */
-	local_irq_save(flags);
-	sim_allow_multiple_caching(1);
-
-	/* Set up the new dest mapping */
-	type0 = kmap_atomic_idx_push();
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
-	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
-	ptep = pte_offset_kernel(pmdp, newdst);
-	if (pte_val(*ptep) != pte_val(dst_pte)) {
-		set_pte(ptep, dst_pte);
-		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
-	}
-
-	/* Set up the new source mapping */
-	type1 = kmap_atomic_idx_push();
-	idx += (type0 - type1);
-	src_pte = hv_pte_set_nc(src_pte);
-	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
-	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
-	ptep = pte_offset_kernel(pmdp, newsrc);
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/* Actually move the data. */
-	__memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
-	/*
-	 * Remap the source as locally-cached and not OLOC'ed so that
-	 * we can inval without also invaling the remote cpu's cache.
-	 * This also avoids known errata with inv'ing cacheable oloc data.
-	 */
-	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
-	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/*
-	 * Do the actual invalidation, covering the full L2 cache line
-	 * at the end since __memcpy_asm() is somewhat aggressive.
-	 */
-	__inv_buffer((void *)newsrc, len);
-
-	/*
-	 * We're done: notify the simulator that all is back to normal,
-	 * and re-enable interrupts and pre-emption.
-	 */
-	kmap_atomic_idx_pop();
-	kmap_atomic_idx_pop();
-	sim_allow_multiple_caching(0);
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
-			       memcpy_t func)
-{
-	/*
-	 * Check if it's big enough to bother with.  We may end up doing a
-	 * small copy via TLB manipulation if we're near a page boundary,
-	 * but presumably we'll make it up when we hit the second page.
-	 */
-	while (len >= LARGE_COPY_CUTOFF) {
-		int copy_size, bytes_left_on_page;
-		pte_t *src_ptep, *dst_ptep;
-		pte_t src_pte, dst_pte;
-		struct page *src_page, *dst_page;
-
-		/* Is the source page oloc'ed to a remote cpu? */
-retry_source:
-		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
-		if (src_ptep == NULL)
-			break;
-		src_pte = *src_ptep;
-		if (!hv_pte_get_present(src_pte) ||
-		    !hv_pte_get_readable(src_pte) ||
-		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
-			break;
-		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
-			break;
-		src_page = pfn_to_page(pte_pfn(src_pte));
-		get_page(src_page);
-		if (pte_val(src_pte) != pte_val(*src_ptep)) {
-			put_page(src_page);
-			goto retry_source;
-		}
-		if (pte_huge(src_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(src_pte);
-			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			src_pte = pfn_pte(pfn, src_pte);
-			src_pte = pte_mksmall(src_pte);
-		}
-
-		/* Is the destination page writable? */
-retry_dest:
-		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
-		if (dst_ptep == NULL) {
-			put_page(src_page);
-			break;
-		}
-		dst_pte = *dst_ptep;
-		if (!hv_pte_get_present(dst_pte) ||
-		    !hv_pte_get_writable(dst_pte)) {
-			put_page(src_page);
-			break;
-		}
-		dst_page = pfn_to_page(pte_pfn(dst_pte));
-		if (dst_page == src_page) {
-			/*
-			 * Source and dest are on the same page; this
-			 * potentially exposes us to incoherence if any
-			 * part of src and dest overlap on a cache line.
-			 * Just give up rather than trying to be precise.
-			 */
-			put_page(src_page);
-			break;
-		}
-		get_page(dst_page);
-		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
-			put_page(dst_page);
-			goto retry_dest;
-		}
-		if (pte_huge(dst_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(dst_pte);
-			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			dst_pte = pfn_pte(pfn, dst_pte);
-			dst_pte = pte_mksmall(dst_pte);
-		}
-
-		/* All looks good: create a cachable PTE and copy from it */
-		copy_size = len;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
-		/* Release the pages */
-		put_page(dst_page);
-		put_page(src_page);
-
-		/* Continue on the next page */
-		dest += copy_size;
-		source += copy_size;
-		len -= copy_size;
-	}
-
-	return func(dest, source, len);
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return (void *)__memcpy_asm(to, from, n);
-	else
-		return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
-				      unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_to_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
-					unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
-				       unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_zeroing_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
index 37440ca..88c7016 100644
--- a/arch/tile/lib/memcpy_user_64.c
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -31,6 +31,7 @@
 		    ".pushsection .coldtext.memcpy,\"ax\";"	\
 		    "2: { move r0, %2; jrp lr };"		\
 		    ".section __ex_table,\"a\";"		\
+		    ".align 8;"					\
 		    ".quad 1b, 2b;"				\
 		    ".popsection"				\
 		    : "=m" (*(p)) : "r" (v), "r" (n));		\
@@ -43,6 +44,7 @@
 		    ".pushsection .coldtext.memcpy,\"ax\";"	\
 		    "2: { move r0, %2; jrp lr };"		\
 		    ".section __ex_table,\"a\";"		\
+		    ".align 8;"					\
 		    ".quad 1b, 2b;"				\
 		    ".popsection"				\
 		    : "=r" (__v) : "m" (*(p)), "r" (n));	\
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a..2042bfe 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
  *   more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
 
 void *memset(void *s, int c, size_t n)
 {
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
 	int n32;
 	uint32_t v16, v32;
 	uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
-	int ahead32;
-#else
 	int to_align32;
-#endif
 
 	/* Experimentation shows that a trivial tight loop is a win up until
 	 * around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
 		return s;
 	}
 
-#if !CHIP_HAS_WH64()
-	/* Use a spare issue slot to start prefetching the first cache
-	 * line early. This instruction is free as the store can be buried
-	 * in otherwise idle issue slots doing ALU ops.
-	 */
-	__insn_prefetch(out8);
-
-	/* We prefetch the end so that a short memset that spans two cache
-	 * lines gets some prefetching benefit. Again we believe this is free
-	 * to issue.
-	 */
-	__insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
 	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
 	while (((uintptr_t) out8 & 3) != 0) {
 		*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
 
-#if !CHIP_HAS_WH64()
-
-	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
-	/* We already prefetched the first and last cache lines, so
-	 * we only need to do more prefetching if we are storing
-	 * to more than two cache lines.
-	 */
-	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
-		int i;
-
-		/* Prefetch the next several cache lines.
-		 * This is the setup code for the software-pipelined
-		 * loop below.
-		 */
-#define MAX_PREFETCH 5
-		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
-		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
-			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
-		for (i = CACHE_LINE_SIZE_IN_WORDS;
-		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
-			__insn_prefetch(&out32[i]);
-	}
-
-	if (n32 > ahead32) {
-		while (1) {
-			int j;
-
-			/* Prefetch by reading one word several cache lines
-			 * ahead.  Since loads are non-blocking this will
-			 * cause the full cache line to be read while we are
-			 * finishing earlier cache lines.  Using a store
-			 * here causes microarchitectural performance
-			 * problems where a victimizing store miss goes to
-			 * the head of the retry FIFO and locks the pipe for
-			 * a few cycles.  So a few subsequent stores in this
-			 * loop go into the retry FIFO, and then later
-			 * stores see other stores to the same cache line
-			 * are already in the retry FIFO and themselves go
-			 * into the retry FIFO, filling it up and grinding
-			 * to a halt waiting for the original miss to be
-			 * satisfied.
-			 */
-			__insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
-			n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
-			/* Save icache space by only partially unrolling
-			 * this loop.
-			 */
-			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-			}
-
-			/* To save compiled code size, reuse this loop even
-			 * when we run out of prefetching to do by dropping
-			 * ahead32 down.
-			 */
-			if (n32 <= ahead32) {
-				/* Not even a full cache line left,
-				 * so stop now.
-				 */
-				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
-					break;
-
-				/* Choose a small enough value that we don't
-				 * prefetch past the end.  There's no sense
-				 * in touching cache lines we don't have to.
-				 */
-				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
-			}
-		}
-	}
-
-#else /* CHIP_HAS_WH64() */
-
 	/* Determine how many words we need to emit before the 'out32'
 	 * pointer becomes aligned modulo the cache line size.
 	 */
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
 		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 	}
 
-#endif /* CHIP_HAS_WH64() */
-
 	/* Now handle any leftover values. */
 	if (n32 != 0) {
 		do {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
index 3873085..03ef69c 100644
--- a/arch/tile/lib/memset_64.c
+++ b/arch/tile/lib/memset_64.c
@@ -12,13 +12,11 @@
  *   more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
+#include "string-endian.h"
 
 void *memset(void *s, int c, size_t n)
 {
@@ -70,8 +68,7 @@ void *memset(void *s, int c, size_t n)
 	n64 = n >> 3;
 
 	/* Tile input byte out to 64 bits. */
-	/* KLUDGE */
-	v64 = 0x0101010101010101ULL * (uint8_t)c;
+	v64 = copy_byte(c);
 
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
index c94e6f7..841fe69 100644
--- a/arch/tile/lib/strchr_32.c
+++ b/arch/tile/lib/strchr_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strchr
-
 char *strchr(const char *s, int c)
 {
 	int z, g;
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
index f39f9dc..fe6e31c 100644
--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@@ -26,7 +26,7 @@ char *strchr(const char *s, int c)
 	const uint64_t *p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+	const uint64_t goal = copy_byte(c);
 
 	/* Read the first aligned word, but force bytes before the string to
 	 * match neither zero nor goal (we make sure the high bit of each
diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h
index c0eed7c..2e49cbf 100644
--- a/arch/tile/lib/string-endian.h
+++ b/arch/tile/lib/string-endian.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or
  *   modify it under the terms of the GNU General Public License
@@ -31,3 +31,14 @@
 #define CFZ(x) __insn_clz(x)
 #define REVCZ(x) __insn_ctz(x)
 #endif
+
+/*
+ * Replicate one byte into all eight bytes of a uint64_t.  The uint8_t
+ * parameter keeps only the low byte of an int argument.  Byte Shuffle
+ * uses the bytes of srcB as indices into the dest vector to select a
+ * byte; with all indices zero, the first byte is copied everywhere.
+ */
+static inline uint64_t copy_byte(uint8_t byte)
+{
+	return __insn_shufflebytes(byte, 0, 0);
+}
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index 4974292..f26f88e 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strlen
-
 size_t strlen(const char *s)
 {
 	/* Get an aligned pointer. */
diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c
new file mode 100644
index 0000000..1434141
--- /dev/null
+++ b/arch/tile/lib/strnlen_32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+size_t strnlen(const char *s, size_t count)
+{
+	/* Word-at-a-time strnlen: start from s rounded down to a word. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint32_t *p = (const uint32_t *)(s_int & -4);
+	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+	size_t len;	/* distance from s to the first zero byte */
+	uint32_t v, bits;
+
+	/* When count is 0, read nothing so we cannot take a page fault. */
+	if (count == 0)
+		return 0;
+
+	/* Load the first word, forcing the bytes that precede s nonzero. */
+	v = *p | ((1 << ((s_int << 3) & 31)) - 1);
+
+	while ((bits = __insn_seqb(v, 0)) == 0) {
+		if (bytes_read >= count) {
+			/* COUNT bytes examined and no terminator seen. */
+			return count;
+		}
+		v = *++p;
+		bytes_read += sizeof(v);
+	}
+
+	len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s;
+	return (len < count ? len : count);	/* NUL may lie past count */
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c
new file mode 100644
index 0000000..2e8de6a
--- /dev/null
+++ b/arch/tile/lib/strnlen_64.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+size_t strnlen(const char *s, size_t count)
+{
+	/* Word-at-a-time strnlen: start from s rounded down to a word. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint64_t *p = (const uint64_t *)(s_int & -8);
+	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+	size_t len;	/* distance from s to the first zero byte */
+	uint64_t v, bits;
+
+	/* When count is 0, read nothing so we cannot take a page fault. */
+	if (count == 0)
+		return 0;
+
+	/* Read the first word; MASK() is expected to hide bytes before s. */
+	v = *p | MASK(s_int);
+
+	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) {
+		if (bytes_read >= count) {
+			/* COUNT bytes examined and no terminator seen. */
+			return count;
+		}
+		v = *++p;
+		bytes_read += sizeof(v);
+	}
+
+	len = ((const char *) p) + (CFZ(bits) >> 3) - s;
+	return (len < count ? len : count);	/* NUL may lie past count */
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S
index b62d002..1bc1622 100644
--- a/arch/tile/lib/usercopy_32.S
+++ b/arch/tile/lib/usercopy_32.S
@@ -36,6 +36,7 @@ strnlen_user_fault:
 	{ move r0, zero; jrp lr }
 	ENDPROC(strnlen_user_fault)
 	.section __ex_table,"a"
+	.align 4
 	.word 1b, strnlen_user_fault
 	.popsection
 
@@ -47,18 +48,20 @@ strnlen_user_fault:
  */
 STD_ENTRY(strncpy_from_user_asm)
 	{ bz r2, 2f; move r3, r0 }
-1:      { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1:	{ lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
 	{ sb r0, r4; addi r0, r0, 1 }
-	bz r2, 2f
-	bnzt r4, 1b
-	addi r0, r0, -1   /* don't count the trailing NUL */
-2:      { sub r0, r0, r3; jrp lr }
+	bz r4, 3f         /* just stored the NUL: string is complete */
+	bnzt r2, 1b       /* r2 bytes still allowed: copy the next one */
+2:	{ sub r0, r0, r3; jrp lr }  /* r2 was/reached 0: bytes stored */
+3:	addi r0, r0, -1   /* don't count the trailing NUL */
+	{ sub r0, r0, r3; jrp lr }
 	STD_ENDPROC(strncpy_from_user_asm)
 	.pushsection .fixup,"ax"
 strncpy_from_user_fault:
 	{ movei r0, -EFAULT; jrp lr }
 	ENDPROC(strncpy_from_user_fault)
 	.section __ex_table,"a"
+	.align 4
 	.word 1b, strncpy_from_user_fault
 	.popsection
 
@@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm)
 	bnzt r1, 1b
 2:      { move r0, r1; jrp lr }
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(clear_user_asm)
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(flush_user_asm)
 	.pushsection __ex_table,"a"
-	.word 1b, 2b
-	.popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
-	bz r1, 2f
-	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
-	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
-	{ and r0, r0, r2; and r1, r1, r2 }
-	{ sub r1, r1, r0 }
-1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
-	{ addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b }
-2:      { move r0, r1; jrp lr }
-	STD_ENDPROC(inv_user_asm)
-	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(finv_user_asm)
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
index adb2dbb..b3b31a3 100644
--- a/arch/tile/lib/usercopy_64.S
+++ b/arch/tile/lib/usercopy_64.S
@@ -36,6 +36,7 @@ strnlen_user_fault:
 	{ move r0, zero; jrp lr }
 	ENDPROC(strnlen_user_fault)
 	.section __ex_table,"a"
+	.align 8
 	.quad 1b, strnlen_user_fault
 	.popsection
 
@@ -47,18 +48,20 @@ strnlen_user_fault:
  */
 STD_ENTRY(strncpy_from_user_asm)
 	{ beqz r2, 2f; move r3, r0 }
-1:      { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1:	{ ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
 	{ st1 r0, r4; addi r0, r0, 1 }
-	beqz r2, 2f
-	bnezt r4, 1b
-	addi r0, r0, -1   /* don't count the trailing NUL */
-2:      { sub r0, r0, r3; jrp lr }
+	beqz r4, 3f       /* just stored the NUL: string is complete */
+	bnezt r2, 1b      /* r2 bytes still allowed: copy the next one */
+2:	{ sub r0, r0, r3; jrp lr }  /* r2 was/reached 0: bytes stored */
+3:	addi r0, r0, -1   /* don't count the trailing NUL */
+	{ sub r0, r0, r3; jrp lr }
 	STD_ENDPROC(strncpy_from_user_asm)
 	.pushsection .fixup,"ax"
 strncpy_from_user_fault:
 	{ movei r0, -EFAULT; jrp lr }
 	ENDPROC(strncpy_from_user_fault)
 	.section __ex_table,"a"
+	.align 8
 	.quad 1b, strncpy_from_user_fault
 	.popsection
 
@@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm)
 	bnezt r1, 1b
 2:      { move r0, r1; jrp lr }
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(clear_user_asm)
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(flush_user_asm)
 	.pushsection __ex_table,"a"
-	.quad 1b, 2b
-	.popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
-	beqz r1, 2f
-	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
-	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
-	{ and r0, r0, r2; and r1, r1, r2 }
-	{ sub r1, r1, r0 }
-1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
-	{ addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
-2:      { move r0, r1; jrp lr }
-	STD_ENDPROC(inv_user_asm)
-	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(finv_user_asm)
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 743c951..23f044e 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -21,7 +21,8 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
-#include <arch/sim_def.h>
+#include <asm/vdso.h>
+#include <arch/sim.h>
 
 /* Notify a running simulator, if any, that an exec just occurred. */
 static void sim_notify_exec(const char *binary_name)
@@ -38,21 +39,55 @@ static void sim_notify_exec(const char *binary_name)
 
 static int notify_exec(struct mm_struct *mm)
 {
-	int retval = 0;  /* failure */
-
-	if (mm->exe_file) {
-		char *buf = (char *) __get_free_page(GFP_KERNEL);
-		if (buf) {
-			char *path = d_path(&mm->exe_file->f_path,
-					    buf, PAGE_SIZE);
-			if (!IS_ERR(path)) {
-				sim_notify_exec(path);
-				retval = 1;
-			}
-			free_page((unsigned long)buf);
+	char *buf, *path;
+	struct vm_area_struct *vma;
+
+	if (!sim_is_simulator())
+		return 1;	/* no simulator, so nothing to report */
+
+	if (mm->exe_file == NULL)
+		return 0;
+
+	for (vma = current->mm->mmap; ; vma = vma->vm_next) {
+		if (vma == NULL)
+			return 0;	/* executable is not mapped */
+		if (vma->vm_file == mm->exe_file)
+			break;
+	}
+
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (buf == NULL)
+		return 0;
+
+	path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		free_page((unsigned long)buf);
+		return 0;
+	}
+
+	/*
+	 * Notify simulator of an ET_DYN object so we know the load address.
+	 * The somewhat cryptic overuse of SIM_CONTROL_DLOPEN allows us
+	 * to be backward-compatible with older simulator releases.
+	 */
+	if (vma->vm_start == (ELF_ET_DYN_BASE & PAGE_MASK)) {
+		char addr_str[64];
+		int i;
+
+		snprintf(addr_str, sizeof(addr_str), "0x%lx:@", vma->vm_start);
+		for (i = 0; ; ++i) {
+			char c = addr_str[i];
+			__insn_mtspr(SPR_SIM_CONTROL,
+				     (SIM_CONTROL_DLOPEN
+				      | (c << _SIM_CONTROL_OPERATOR_BITS)));
+			if (c == '\0')	/* the NUL is sent too, then stop */
+				break;
 		}
 	}
-	return retval;
+
+	sim_notify_exec(path);
+	free_page((unsigned long)buf);
+	return 1;
 }
 
 /* Notify a running simulator, if any, that we loaded an interpreter. */
@@ -68,37 +103,10 @@ static void sim_notify_interp(unsigned long load_addr)
 }
 
 
-/* Kernel address of page used to map read-only kernel data into userspace. */
-static void *vdso_page;
-
-/* One-entry array used for install_special_mapping. */
-static struct page *vdso_pages[1];
-
-static int __init vdso_setup(void)
-{
-	vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
-	memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
-	vdso_pages[0] = virt_to_page(vdso_page);
-	return 0;
-}
-device_initcall(vdso_setup);
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	if (vma->vm_private_data == vdso_pages)
-		return "[vdso]";
-#ifndef __tilegx__
-	if (vma->vm_start == MEM_USER_INTRPT)
-		return "[intrpt]";
-#endif
-	return NULL;
-}
-
 int arch_setup_additional_pages(struct linux_binprm *bprm,
 				int executable_stack)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long vdso_base;
 	int retval = 0;
 
 	down_write(&mm->mmap_sem);
@@ -111,14 +119,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	if (!notify_exec(mm))
 		sim_notify_exec(bprm->filename);
 
-	/*
-	 * MAYWRITE to allow gdb to COW and set breakpoints
-	 */
-	vdso_base = VDSO_BASE;
-	retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
-					 VM_READ|VM_EXEC|
-					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-					 vdso_pages);
+	retval = setup_vdso_pages();
 
 #ifndef __tilegx__
 	/*
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index f7f99f9..111d5a9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -34,6 +34,7 @@
 #include <linux/hugetlb.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/kdebug.h>
 
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
@@ -122,10 +123,9 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	pmd_k = pmd_offset(pud_k, address);
 	if (!pmd_present(*pmd_k))
 		return NULL;
-	if (!pmd_present(*pmd)) {
+	if (!pmd_present(*pmd))
 		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else
+	else
 		BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
 	return pmd_k;
 }
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
 		 (write ? FAULT_FLAG_WRITE : 0));
 
-	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+	is_kernel_mode = !user_mode(regs);
 
 	tsk = validate_current();
 
@@ -466,28 +466,15 @@ good_area:
 		}
 	}
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	/*
-	 * If this was an asynchronous fault,
-	 * restart the appropriate engine.
-	 */
-	switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
+	/* If this was a DMA TLB fault, restart the DMA engine. */
+	switch (fault_num) {
 	case INT_DMATLB_MISS:
 	case INT_DMATLB_MISS_DWNCL:
 	case INT_DMATLB_ACCESS:
 	case INT_DMATLB_ACCESS_DWNCL:
 		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
 		break;
-#endif
-#if CHIP_HAS_SN_PROC()
-	case INT_SNITLB_MISS:
-	case INT_SNITLB_MISS_DWNCL:
-		__insn_mtspr(SPR_SNCTL,
-			     __insn_mfspr(SPR_SNCTL) &
-			     ~SPR_SNCTL__FRZPROC_MASK);
-		break;
-#endif
 	}
 #endif
 
@@ -722,8 +709,60 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 {
 	int is_page_fault;
 
+#ifdef CONFIG_KPROBES
+	/*
+	 * Give the kprobes fault handler a chance to claim this fault
+	 * first.  The fault number is redundant, since it is also
+	 * carried in REGS, but we pass it anyhow.
+	 */
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1,
+		       regs->faultnum, SIGSEGV) == NOTIFY_STOP)
+		return;
+#endif
+
+#ifdef __tilegx__
+	/*
+	 * We don't need early do_page_fault_ics() support, since unlike
+	 * Pro we don't need to worry about unlocking the atomic locks.
+	 * There is only one current case in GX where we touch any memory
+	 * under ICS other than our own kernel stack, and we handle that
+	 * here.  (If we crash due to trying to touch our own stack,
+	 * we're in too much trouble for C code to help out anyway.)
+	 */
+	if (write & ~1) {
+		unsigned long pc = write & ~1;
+		if (pc >= (unsigned long) __start_unalign_asm_code &&
+		    pc < (unsigned long) __end_unalign_asm_code) {
+			struct thread_info *ti = current_thread_info();
+			/*
+			 * Our EX_CONTEXT is still what it was from the
+			 * initial unalign exception, but now we've faulted
+			 * on the JIT page.  We would like to complete the
+			 * page fault in whatever way is appropriate, and retry
+			 * the instruction that caused the unalign exception.
+			 * Our state has been "corrupted" by setting the low
+			 * bit in "sp", and stashing r0..r3 in the
+			 * thread_info area, so we revert all of that, then
+			 * continue as if this were a normal page fault.
+			 */
+			regs->sp &= ~1UL;
+			regs->regs[0] = ti->unalign_jit_tmp[0];
+			regs->regs[1] = ti->unalign_jit_tmp[1];
+			regs->regs[2] = ti->unalign_jit_tmp[2];
+			regs->regs[3] = ti->unalign_jit_tmp[3];
+			write &= 1;
+		} else {
+			pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n",
+				 current->comm, current->pid, pc, address);
+			show_regs(regs);
+			do_group_exit(SIGKILL);
+			return;
+		}
+	}
+#else
 	/* This case should have been handled by do_page_fault_ics(). */
 	BUG_ON(write & ~1);
+#endif
 
 #if CHIP_HAS_TILE_DMA()
 	/*
@@ -752,10 +791,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 	case INT_DMATLB_MISS:
 	case INT_DMATLB_MISS_DWNCL:
 #endif
-#if CHIP_HAS_SN_PROC()
-	case INT_SNITLB_MISS:
-	case INT_SNITLB_MISS_DWNCL:
-#endif
 		is_page_fault = 1;
 		break;
 
@@ -771,8 +806,8 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 		panic("Bad fault number %d in do_page_fault", fault_num);
 	}
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	if (EX1_PL(regs->ex1) != USER_PL) {
+#if CHIP_HAS_TILE_DMA()
+	if (!user_mode(regs)) {
 		struct async_tlb *async;
 		switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
@@ -783,12 +818,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 			async = &current->thread.dma_async_tlb;
 			break;
 #endif
-#if CHIP_HAS_SN_PROC()
-		case INT_SNITLB_MISS:
-		case INT_SNITLB_MISS_DWNCL:
-			async = &current->thread.sn_async_tlb;
-			break;
-#endif
 		default:
 			async = NULL;
 		}
@@ -821,14 +850,22 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 }
 
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 /*
- * Check an async_tlb structure to see if a deferred fault is waiting,
- * and if so pass it to the page-fault code.
+ * This routine effectively re-issues asynchronous page faults
+ * when we are returning to user space.
  */
-static void handle_async_page_fault(struct pt_regs *regs,
-				    struct async_tlb *async)
+void do_async_page_fault(struct pt_regs *regs)
 {
+	struct async_tlb *async = &current->thread.dma_async_tlb;
+
+	/*
+	 * Clear thread flag early.  If we re-interrupt while processing
+	 * code here, we will reset it and recall this routine before
+	 * returning to user space.
+	 */
+	clear_thread_flag(TIF_ASYNC_TLB);
+
 	if (async->fault_num) {
 		/*
 		 * Clear async->fault_num before calling the page-fault
@@ -842,35 +879,15 @@ static void handle_async_page_fault(struct pt_regs *regs,
 				  async->address, async->is_write);
 	}
 }
-
-/*
- * This routine effectively re-issues asynchronous page faults
- * when we are returning to user space.
- */
-void do_async_page_fault(struct pt_regs *regs)
-{
-	/*
-	 * Clear thread flag early.  If we re-interrupt while processing
-	 * code here, we will reset it and recall this routine before
-	 * returning to user space.
-	 */
-	clear_thread_flag(TIF_ASYNC_TLB);
-
-#if CHIP_HAS_TILE_DMA()
-	handle_async_page_fault(regs, &current->thread.dma_async_tlb);
-#endif
-#if CHIP_HAS_SN_PROC()
-	handle_async_page_fault(regs, &current->thread.sn_async_tlb);
-#endif
-}
-#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
+#endif /* CHIP_HAS_TILE_DMA() */
 
 
 void vmalloc_sync_all(void)
 {
 #ifdef __tilegx__
 	/* Currently all L1 kernel pmd's are static and shared. */
-	BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
+	BUILD_BUG_ON(pgd_index(VMALLOC_END - PAGE_SIZE) !=
+		     pgd_index(VMALLOC_START));
 #else
 	/*
 	 * Note that races in the updates of insync and start aren't
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 347d123..0dc2182 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -114,7 +114,6 @@ static void kmap_atomic_register(struct page *page, int type,
 
 	list_add(&amp->list, &amp_list);
 	set_pte(ptep, pteval);
-	arch_flush_lazy_mmu_mode();
 
 	spin_unlock(&amp_lock);
 	homecache_kpte_unlock(flags);
@@ -259,7 +258,6 @@ void __kunmap_atomic(void *kvaddr)
 		BUG_ON(vaddr >= (unsigned long)high_memory);
 	}
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 1ae9119..004ba56 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -43,12 +43,9 @@
 #include "migrate.h"
 
 
-#if CHIP_HAS_COHERENT_LOCAL_CACHE()
-
 /*
  * The noallocl2 option suppresses all use of the L2 cache to cache
- * locally from a remote home.  There's no point in using it if we
- * don't have coherent local caching, though.
+ * locally from a remote home.
  */
 static int __write_once noallocl2;
 static int __init set_noallocl2(char *str)
@@ -58,12 +55,6 @@ static int __init set_noallocl2(char *str)
 }
 early_param("noallocl2", set_noallocl2);
 
-#else
-
-#define noallocl2 0
-
-#endif
-
 
 /*
  * Update the irq_stat for cpus that we are going to interrupt
@@ -172,7 +163,8 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 
 static void homecache_finv_page_va(void* va, int home)
 {
-	if (home == smp_processor_id()) {
+	int cpu = get_cpu();
+	if (home == cpu) {
 		finv_buffer_local(va, PAGE_SIZE);
 	} else if (home == PAGE_HOME_HASH) {
 		finv_buffer_remote(va, PAGE_SIZE, 1);
@@ -180,6 +172,7 @@ static void homecache_finv_page_va(void* va, int home)
 		BUG_ON(home < 0 || home >= NR_CPUS);
 		finv_buffer_remote(va, PAGE_SIZE, 0);
 	}
+	put_cpu();
 }
 
 void homecache_finv_map_page(struct page *page, int home)
@@ -198,7 +191,7 @@ void homecache_finv_map_page(struct page *page, int home)
 #else
 	va = __fix_to_virt(FIX_HOMECACHE_BEGIN + smp_processor_id());
 #endif
-	ptep = virt_to_pte(NULL, (unsigned long)va);
+	ptep = virt_to_kpte(va);
 	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
 	__set_pte(ptep, pte_set_home(pte, home));
 	homecache_finv_page_va((void *)va, home);
@@ -263,10 +256,8 @@ static int pte_to_home(pte_t pte)
 		return PAGE_HOME_INCOHERENT;
 	case HV_PTE_MODE_UNCACHED:
 		return PAGE_HOME_UNCACHED;
-#if CHIP_HAS_CBOX_HOME_MAP()
 	case HV_PTE_MODE_CACHE_HASH_L3:
 		return PAGE_HOME_HASH;
-#endif
 	}
 	panic("Bad PTE %#llx\n", pte.val);
 }
@@ -323,20 +314,16 @@ pte_t pte_set_home(pte_t pte, int home)
 						      HV_PTE_MODE_CACHE_NO_L3);
 			}
 		} else
-#if CHIP_HAS_CBOX_HOME_MAP()
 		if (hash_default)
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
 		else
-#endif
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
 		pte = hv_pte_set_nc(pte);
 		break;
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	case PAGE_HOME_HASH:
 		pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
 		break;
-#endif
 
 	default:
 		BUG_ON(home < 0 || home >= NR_CPUS ||
@@ -346,7 +333,6 @@ pte_t pte_set_home(pte_t pte, int home)
 		break;
 	}
 
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
 	if (noallocl2)
 		pte = hv_pte_set_no_alloc_l2(pte);
 
@@ -355,7 +341,6 @@ pte_t pte_set_home(pte_t pte, int home)
 	    hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {
 		pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
 	}
-#endif
 
 	/* Checking this case here gives a better panic than from the hv. */
 	BUG_ON(hv_pte_get_mode(pte) == 0);
@@ -371,19 +356,13 @@ EXPORT_SYMBOL(pte_set_home);
  * so they're not suitable for anything but infrequent use.
  */
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-static inline int initial_page_home(void) { return PAGE_HOME_HASH; }
-#else
-static inline int initial_page_home(void) { return 0; }
-#endif
-
 int page_home(struct page *page)
 {
 	if (PageHighMem(page)) {
-		return initial_page_home();
+		return PAGE_HOME_HASH;
 	} else {
 		unsigned long kva = (unsigned long)page_address(page);
-		return pte_to_home(*virt_to_pte(NULL, kva));
+		return pte_to_home(*virt_to_kpte(kva));
 	}
 }
 EXPORT_SYMBOL(page_home);
@@ -402,7 +381,7 @@ void homecache_change_page_home(struct page *page, int order, int home)
 		     NULL, 0);
 
 	for (i = 0; i < pages; ++i, kva += PAGE_SIZE) {
-		pte_t *ptep = virt_to_pte(NULL, kva);
+		pte_t *ptep = virt_to_kpte(kva);
 		pte_t pteval = *ptep;
 		BUG_ON(!pte_present(pteval) || pte_huge(pteval));
 		__set_pte(ptep, pte_set_home(pteval, home));
@@ -436,7 +415,7 @@ struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
 void __homecache_free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page)) {
-		homecache_change_page_home(page, order, initial_page_home());
+		homecache_change_page_home(page, order, PAGE_HOME_HASH);
 		if (order == 0) {
 			free_hot_cold_page(page, 0);
 		} else {
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 650ccff..e514899 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -49,38 +49,6 @@ int huge_shift[HUGE_SHIFT_ENTRIES] = {
 #endif
 };
 
-/*
- * This routine is a hybrid of pte_alloc_map() and pte_alloc_kernel().
- * It assumes that L2 PTEs are never in HIGHMEM (we don't support that).
- * It locks the user pagetable, and bumps up the mm->nr_ptes field,
- * but otherwise allocate the page table using the kernel versions.
- */
-static pte_t *pte_alloc_hugetlb(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long address)
-{
-	pte_t *new;
-
-	if (pmd_none(*pmd)) {
-		new = pte_alloc_one_kernel(mm, address);
-		if (!new)
-			return NULL;
-
-		smp_wmb(); /* See comment in __pte_alloc */
-
-		spin_lock(&mm->page_table_lock);
-		if (likely(pmd_none(*pmd))) {  /* Has another populated it ? */
-			mm->nr_ptes++;
-			pmd_populate_kernel(mm, pmd, new);
-			new = NULL;
-		} else
-			VM_BUG_ON(pmd_trans_splitting(*pmd));
-		spin_unlock(&mm->page_table_lock);
-		if (new)
-			pte_free_kernel(mm, new);
-	}
-
-	return pte_offset_kernel(pmd, address);
-}
 #endif
 
 pte_t *huge_pte_alloc(struct mm_struct *mm,
@@ -109,7 +77,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 		else {
 			if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
 				panic("Unexpected page size %#lx\n", sz);
-			return pte_alloc_hugetlb(mm, pmd, addr);
+			return pte_alloc_map(mm, NULL, pmd, addr);
 		}
 	}
 #else
@@ -144,14 +112,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 
 	/* Get the top-level page table entry. */
 	pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0);
-	if (!pgd_present(*pgd))
-		return NULL;
 
 	/* We don't have four levels. */
 	pud = pud_offset(pgd, addr);
 #ifndef __PAGETABLE_PUD_FOLDED
 # error support fourth page table level
 #endif
+	if (!pud_present(*pud))
+		return NULL;
 
 	/* Check for an L0 huge PTE, if we have three levels. */
 #ifndef __PAGETABLE_PMD_FOLDED
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index e182958..4e316deb 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -106,10 +106,8 @@ pte_t *get_prealloc_pte(unsigned long pfn)
  */
 static int initial_heap_home(void)
 {
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (hash_default)
 		return PAGE_HOME_HASH;
-#endif
 	return smp_processor_id();
 }
 
@@ -190,14 +188,11 @@ static void __init page_table_range_init(unsigned long start,
 }
 
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-
 static int __initdata ktext_hash = 1;  /* .text pages */
 static int __initdata kdata_hash = 1;  /* .data and .bss pages */
 int __write_once hash_default = 1;     /* kernel allocator pages */
 EXPORT_SYMBOL(hash_default);
 int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
-#endif /* CHIP_HAS_CBOX_HOME_MAP */
 
 /*
  * CPUs to use to for striping the pages of kernel data.  If hash-for-home
@@ -215,14 +210,12 @@ int __write_once kdata_huge;       /* if no homecaching, small pages */
 static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
 {
 	prot = pte_set_home(prot, home);
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (home == PAGE_HOME_IMMUTABLE) {
 		if (ktext_hash)
 			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
 		else
 			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
 	}
-#endif
 	return prot;
 }
 
@@ -234,22 +227,17 @@ static pgprot_t __init init_pgprot(ulong address)
 {
 	int cpu;
 	unsigned long page;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* For kdata=huge, everything is just hash-for-home. */
 	if (kdata_huge)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
 
 	/* We map the aliased pages of permanent text inaccessible. */
 	if (address < (ulong) _sinittext - CODE_DELTA)
 		return PAGE_NONE;
 
-	/*
-	 * We map read-only data non-coherent for performance.  We could
-	 * use neighborhood caching on TILE64, but it's not clear it's a win.
-	 */
+	/* We map read-only data non-coherent for performance. */
 	if ((address >= (ulong) __start_rodata &&
 	     address < (ulong) __end_rodata) ||
 	    address == (ulong) empty_zero_page) {
@@ -257,12 +245,10 @@ static pgprot_t __init init_pgprot(ulong address)
 	}
 
 #ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	/* Force the atomic_locks[] array page to be hash-for-home. */
 	if (address == (ulong) atomic_locks)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
 #endif
-#endif
 
 	/*
 	 * Everything else that isn't data or bss is heap, so mark it
@@ -280,19 +266,9 @@ static pgprot_t __init init_pgprot(ulong address)
 	if (address >= (ulong) _end || address < (ulong) _einitdata)
 		return construct_pgprot(PAGE_KERNEL, initial_heap_home());
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* Use hash-for-home if requested for data/bss. */
 	if (kdata_hash)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
-
-	/*
-	 * Make the w1data homed like heap to start with, to avoid
-	 * making it part of the page-striped data area when we're just
-	 * going to convert it to read-only soon anyway.
-	 */
-	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
-		return construct_pgprot(PAGE_KERNEL, initial_heap_home());
 
 	/*
 	 * Otherwise we just hand out consecutive cpus.  To avoid
@@ -301,7 +277,7 @@ static pgprot_t __init init_pgprot(ulong address)
 	 * the requested address, while walking cpu home around kdata_mask.
 	 * This is typically no more than a dozen or so iterations.
 	 */
-	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
+	page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;
 	BUG_ON(address < page || address >= (ulong)_end);
 	cpu = cpumask_first(&kdata_mask);
 	for (; page < address; page += PAGE_SIZE) {
@@ -311,11 +287,9 @@ static pgprot_t __init init_pgprot(ulong address)
 		if (page == (ulong)empty_zero_page)
 			continue;
 #ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 		if (page == (ulong)atomic_locks)
 			continue;
 #endif
-#endif
 		cpu = cpumask_next(cpu, &kdata_mask);
 		if (cpu == NR_CPUS)
 			cpu = cpumask_first(&kdata_mask);
@@ -358,7 +332,7 @@ static int __init setup_ktext(char *str)
 
 	ktext_arg_seen = 1;
 
-	/* Default setting on Tile64: use a huge page */
+	/* Default setting: use a huge page */
 	if (strcmp(str, "huge") == 0)
 		pr_info("ktext: using one huge locally cached page\n");
 
@@ -404,10 +378,8 @@ static inline pgprot_t ktext_set_nocache(pgprot_t prot)
 {
 	if (!ktext_nocache)
 		prot = hv_pte_set_nc(prot);
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
 	else
 		prot = hv_pte_set_no_alloc_l2(prot);
-#endif
 	return prot;
 }
 
@@ -440,7 +412,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	struct cpumask kstripe_mask;
 	int rc, i;
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (ktext_arg_seen && ktext_hash) {
 		pr_warning("warning: \"ktext\" boot argument ignored"
 			   " if \"kcache_hash\" sets up text hash-for-home\n");
@@ -457,7 +428,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 			  " kcache_hash=all or =allbutstack\n");
 		kdata_huge = 0;
 	}
-#endif
 
 	/*
 	 * Set up a mask for cpus to use for kernel striping.
@@ -538,7 +508,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	address = MEM_SV_INTRPT;
+	address = MEM_SV_START;
 	pmd = get_pmd(pgtables, address);
 	pfn = 0;  /* code starts at PA 0 */
 	if (ktext_small) {
@@ -585,13 +555,11 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	} else {
 		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
 		pteval = pte_mkhuge(pteval);
-#if CHIP_HAS_CBOX_HOME_MAP()
 		if (ktext_hash) {
 			pteval = hv_pte_set_mode(pteval,
 						 HV_PTE_MODE_CACHE_HASH_L3);
 			pteval = ktext_set_nocache(pteval);
 		} else
-#endif /* CHIP_HAS_CBOX_HOME_MAP() */
 		if (cpumask_weight(&ktext_mask) == 1) {
 			pteval = set_remote_cache_cpu(pteval,
 					      cpumask_first(&ktext_mask));
@@ -777,10 +745,7 @@ void __init paging_init(void)
 
 	kernel_physical_mapping_init(pgd_base);
 
-	/*
-	 * Fixed mappings, only the page table structure has to be
-	 * created - mappings will be set by set_fixmap():
-	 */
+	/* Fixed mappings, only the page table structure has to be created. */
 	page_table_range_init(fix_to_virt(__end_of_fixed_addresses - 1),
 			      FIXADDR_TOP, pgd_base);
 
@@ -941,26 +906,6 @@ void __init pgtable_cache_init(void)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
 
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-/*
- * The __w1data area holds data that is only written during initialization,
- * and is read-only and thus freely cacheable thereafter.  Fix the page
- * table entries that cover that region accordingly.
- */
-static void mark_w1data_ro(void)
-{
-	/* Loop over page table entries */
-	unsigned long addr = (unsigned long)__w1data_begin;
-	BUG_ON((addr & (PAGE_SIZE-1)) != 0);
-	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
-		unsigned long pfn = kaddr_to_pfn((void *)addr);
-		pte_t *ptep = virt_to_pte(NULL, addr);
-		BUG_ON(pte_huge(*ptep));   /* not relevant for kdata_huge */
-		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
-	}
-}
-#endif
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static long __write_once initfree;
 #else
@@ -1000,7 +945,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 		 */
 		int pfn = kaddr_to_pfn((void *)addr);
 		struct page *page = pfn_to_page(pfn);
-		pte_t *ptep = virt_to_pte(NULL, addr);
+		pte_t *ptep = virt_to_kpte(addr);
 		if (!initfree) {
 			/*
 			 * If debugging page accesses then do not free
@@ -1024,15 +969,11 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
 
 	/*
-	 * Evict the dirty initdata on the boot cpu, evict the w1data
-	 * wherever it's homed, and evict all the init code everywhere.
-	 * We are guaranteed that no one will touch the init pages any
-	 * more, and although other cpus may be touching the w1data,
-	 * we only actually change the caching on tile64, which won't
-	 * be keeping local copies in the other tiles' caches anyway.
+	 * Evict the cache on all cores to avoid incoherence.
+	 * We are guaranteed that no one will touch the init pages any more.
 	 */
 	homecache_evict(&cpu_cacheable_map);
 
@@ -1043,26 +984,11 @@ void free_initmem(void)
 
 	/*
 	 * Free the pages mapped from 0xc0000000 that correspond to code
-	 * pages from MEM_SV_INTRPT that we won't use again after init.
+	 * pages from MEM_SV_START that we won't use again after init.
 	 */
 	free_init_pages("unused kernel text",
 			(unsigned long)_sinittext - text_delta,
 			(unsigned long)_einittext - text_delta);
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-	/*
-	 * Upgrade the .w1data section to globally cached.
-	 * We don't do this on tilepro, since the cache architecture
-	 * pretty much makes it irrelevant, and in any case we end
-	 * up having racing issues with other tiles that may touch
-	 * the data after we flush the cache but before we update
-	 * the PTEs and flush the TLBs, causing sharer shootdowns
-	 * later.  Even though this is to clean data, it seems like
-	 * an unnecessary complication.
-	 */
-	mark_w1data_ro();
-#endif
-
 	/* Do a global TLB flush so everyone sees the changes. */
 	flush_tlb_all();
 }
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
index 5305814..7720854 100644
--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
@@ -136,7 +136,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r8, zero  /* asids */
 	 move r9, zero  /* asidcount */
 	}
-	jal hv_flush_remote
+	jal _hv_flush_remote
 	bnz r0, .Ldone
 
 	/* Now install the new page table. */
@@ -152,7 +152,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r4, r_asid
 	 moveli r5, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
 	}
-	jal hv_install_context
+	jal _hv_install_context
 	bnz r0, .Ldone
 
 	/* Finally, flush the TLB. */
diff --git a/arch/tile/mm/migrate_64.S b/arch/tile/mm/migrate_64.S
index 1d15b108..a49eee3 100644
--- a/arch/tile/mm/migrate_64.S
+++ b/arch/tile/mm/migrate_64.S
@@ -123,7 +123,7 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 move r8, zero  /* asidcount */
-	 jal hv_flush_remote
+	 jal _hv_flush_remote
 	}
 	bnez r0, 1f
 
@@ -136,7 +136,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r2, r_asid
 	 moveli r3, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
 	}
-	jal hv_install_context
+	jal _hv_install_context
 	bnez r0, 1f
 
 	/* Finally, flush the TLB. */
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index d67d91e..851a94e 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -58,16 +58,36 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 #else
 	int is_32bit = 0;
 #endif
+	unsigned long random_factor = 0UL;
+
+	/*
+	 *  8 bits of randomness in 32bit mmaps, 24 address space bits
+	 * 12 bits of randomness in 64bit mmaps, 28 address space bits
+	 */
+	if (current->flags & PF_RANDOMIZE) {
+		if (is_32bit)
+			random_factor = get_random_int() % (1<<8);
+		else
+			random_factor = get_random_int() % (1<<12);
+
+		random_factor <<= PAGE_SHIFT;
+	}
 
 	/*
 	 * Use standard layout if the expected stack growth is unlimited
 	 * or we are running native 64 bits.
 	 */
-	if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
-		mm->mmap_base = TASK_UNMAPPED_BASE;
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
 	} else {
 		mm->mmap_base = mmap_base(mm);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 	}
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index dfd63ce..2deaddf 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -83,55 +83,6 @@ void show_mem(unsigned int filter)
 	}
 }
 
-/*
- * Associate a virtual page frame with a given physical page frame
- * and protection flags for that frame.
- */
-static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	if (pgd_none(*pgd)) {
-		BUG();
-		return;
-	}
-	pud = pud_offset(pgd, vaddr);
-	if (pud_none(*pud)) {
-		BUG();
-		return;
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (pmd_none(*pmd)) {
-		BUG();
-		return;
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
-	/* <pfn,flags> stored as-is, to permit clearing entries */
-	set_pte(pte, pfn_pte(pfn, flags));
-
-	/*
-	 * It's enough to flush this one mapping.
-	 * This appears conservative since it is only called
-	 * from __set_fixmap.
-	 */
-	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
-}
-
-void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
-{
-	unsigned long address = __fix_to_virt(idx);
-
-	if (idx >= __end_of_fixed_addresses) {
-		BUG();
-		return;
-	}
-	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
-}
-
 /**
  * shatter_huge_page() - ensure a given address is mapped by a small page.
  *
@@ -374,6 +325,17 @@ void ptep_set_wrprotect(struct mm_struct *mm,
 
 #endif
 
+/*
+ * Return a pointer to the PTE that corresponds to the given
+ * address in the given page table.  A NULL page table just uses
+ * the standard kernel page table; the preferred API in this case
+ * is virt_to_kpte().
+ *
+ * The returned pointer can point to a huge page in other levels
+ * of the page table than the bottom, if the huge page is present
+ * in the page table.  For bottom-level PTEs, the returned pointer
+ * can point to a PTE that is either present or not.
+ */
 pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -387,13 +349,23 @@ pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
 	pud = pud_offset(pgd, addr);
 	if (!pud_present(*pud))
 		return NULL;
+	if (pud_huge_page(*pud))
+		return (pte_t *)pud;
 	pmd = pmd_offset(pud, addr);
-	if (pmd_huge_page(*pmd))
-		return (pte_t *)pmd;
 	if (!pmd_present(*pmd))
 		return NULL;
+	if (pmd_huge_page(*pmd))
+		return (pte_t *)pmd;
 	return pte_offset_kernel(pmd, addr);
 }
+EXPORT_SYMBOL(virt_to_pte);
+
+pte_t *virt_to_kpte(unsigned long kaddr)
+{
+	BUG_ON(kaddr < PAGE_OFFSET);
+	return virt_to_pte(NULL, kaddr);
+}
+EXPORT_SYMBOL(virt_to_kpte);
 
 pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
 {
@@ -568,7 +540,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 	addr = area->addr;
 	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
 			       phys_addr, pgprot)) {
-		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+		free_vm_area(area);
 		return NULL;
 	}
 	return (__force void __iomem *) (offset + (char *)addr);
diff --git a/drivers/edac/tile_edac.c b/drivers/edac/tile_edac.c
index a082053..578f915 100644
--- a/drivers/edac/tile_edac.c
+++ b/drivers/edac/tile_edac.c
@@ -257,7 +257,6 @@ static void __exit tile_edac_exit(void)
 		if (!pdev)
 			continue;
 
-		platform_set_drvdata(pdev, NULL);
 		platform_device_unregister(pdev);
 	}
 	platform_driver_unregister(&tile_edac_mc_driver);
diff --git a/drivers/tty/hvc/hvc_tile.c b/drivers/tty/hvc/hvc_tile.c
index 7a84a05..af8cdaa 100644
--- a/drivers/tty/hvc/hvc_tile.c
+++ b/drivers/tty/hvc/hvc_tile.c
@@ -18,16 +18,46 @@
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/moduleparam.h>
+#include <linux/platform_device.h>
 #include <linux/types.h>
 
+#include <asm/setup.h>
+#include <arch/sim_def.h>
+
 #include <hv/hypervisor.h>
 
 #include "hvc_console.h"
 
+static int use_sim_console;
+static int __init sim_console(char *str)
+{
+	use_sim_console = 1;
+	return 0;
+}
+early_param("sim_console", sim_console);
+
+/* Console output: simulator if "sim_console" was given, else hypervisor. */
+int tile_console_write(const char *buf, int count)
+{
+	int i;
+
+	if (likely(!use_sim_console))
+		return hv_console_write((HV_VirtAddr)buf, count);
+	/* Cast so bytes >= 0x80 are not sign-extended into the control word. */
+	for (i = 0; i < count; ++i)
+		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+		    ((unsigned char)buf[i] << _SIM_CONTROL_OPERATOR_BITS));
+	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+		     (SIM_PUTC_FLUSH_BINARY << _SIM_CONTROL_OPERATOR_BITS));
+	return count;	/* all bytes consumed; put_chars reports bytes written */
+}
+
 static int hvc_tile_put_chars(uint32_t vt, const char *buf, int count)
 {
-	return hv_console_write((HV_VirtAddr)buf, count);
+	return tile_console_write(buf, count);
 }
 
 static int hvc_tile_get_chars(uint32_t vt, char *buf, int count)
@@ -44,25 +74,132 @@ static int hvc_tile_get_chars(uint32_t vt, char *buf, int count)
 	return i;
 }
 
+#ifdef __tilegx__
+/*
+ * IRQ based callbacks.
+ */
+static int hvc_tile_notifier_add_irq(struct hvc_struct *hp, int irq)
+{
+	int rc;
+	int cpu = raw_smp_processor_id();  /* Choose an arbitrary cpu */
+	HV_Coord coord = { .x = cpu_x(cpu), .y = cpu_y(cpu) };
+
+	rc = notifier_add_irq(hp, irq);
+	if (rc)
+		return rc;
+
+	/*
+	 * Request that the hypervisor start sending us interrupts.
+	 * If the hypervisor returns an error, we still return 0, so that
+	 * we can fall back to polling.
+	 */
+	if (hv_console_set_ipi(KERNEL_PL, irq, coord) < 0)
+		notifier_del_irq(hp, irq);
+
+	return 0;
+}
+
+static void hvc_tile_notifier_del_irq(struct hvc_struct *hp, int irq)
+{
+	HV_Coord coord = { 0, 0 };
+
+	/* Tell the hypervisor to stop sending us interrupts. */
+	hv_console_set_ipi(KERNEL_PL, -1, coord);
+
+	notifier_del_irq(hp, irq);
+}
+
+static void hvc_tile_notifier_hangup_irq(struct hvc_struct *hp, int irq)
+{
+	hvc_tile_notifier_del_irq(hp, irq);
+}
+#endif
+
 static const struct hv_ops hvc_tile_get_put_ops = {
 	.get_chars = hvc_tile_get_chars,
 	.put_chars = hvc_tile_put_chars,
+#ifdef __tilegx__
+	.notifier_add = hvc_tile_notifier_add_irq,
+	.notifier_del = hvc_tile_notifier_del_irq,
+	.notifier_hangup = hvc_tile_notifier_hangup_irq,
+#endif
+};
+
+
+#ifdef __tilegx__
+static int hvc_tile_probe(struct platform_device *pdev)
+{
+	struct hvc_struct *hp;
+	int tile_hvc_irq;
+
+	/* Create our IRQ and register it. */
+	tile_hvc_irq = create_irq();
+	if (tile_hvc_irq < 0)
+		return -ENXIO;
+
+	tile_irq_activate(tile_hvc_irq, TILE_IRQ_PERCPU);
+	hp = hvc_alloc(0, tile_hvc_irq, &hvc_tile_get_put_ops, 128);
+	if (IS_ERR(hp)) {
+		destroy_irq(tile_hvc_irq);
+		return PTR_ERR(hp);
+	}
+	dev_set_drvdata(&pdev->dev, hp);
+
+	return 0;
+}
+
+/* Tear down the hvc instance, then release the IRQ created in probe. */
+static int hvc_tile_remove(struct platform_device *pdev)
+{
+	struct hvc_struct *hp = dev_get_drvdata(&pdev->dev);
+	int irq = hp->data;	/* read now: hvc_remove() may free hp */
+	int rc = hvc_remove(hp);
+
+	if (rc == 0)
+		destroy_irq(irq);
+	return rc;
+}
+
+static void hvc_tile_shutdown(struct platform_device *pdev)
+{
+	struct hvc_struct *hp = dev_get_drvdata(&pdev->dev);
+
+	hvc_tile_notifier_del_irq(hp, hp->data);
+}
+
+static struct platform_device hvc_tile_pdev = {
+	.name           = "hvc-tile",
+	.id             = 0,
+};
+
+static struct platform_driver hvc_tile_driver = {
+	.probe          = hvc_tile_probe,
+	.remove         = hvc_tile_remove,
+	.shutdown	= hvc_tile_shutdown,
+	.driver         = {
+		.name   = "hvc-tile",
+		.owner  = THIS_MODULE,
+	}
 };
+#endif
 
 static int __init hvc_tile_console_init(void)
 {
-	extern void disable_early_printk(void);
 	hvc_instantiate(0, 0, &hvc_tile_get_put_ops);
 	add_preferred_console("hvc", 0, NULL);
-	disable_early_printk();
 	return 0;
 }
 console_initcall(hvc_tile_console_init);
 
 static int __init hvc_tile_init(void)
 {
-	struct hvc_struct *s;
-	s = hvc_alloc(0, 0, &hvc_tile_get_put_ops, 128);
-	return IS_ERR(s) ? PTR_ERR(s) : 0;
+#ifndef __tilegx__
+	struct hvc_struct *hp;
+	hp = hvc_alloc(0, 0, &hvc_tile_get_put_ops, 128);
+	return IS_ERR(hp) ? PTR_ERR(hp) : 0;
+#else
+	platform_device_register(&hvc_tile_pdev);
+	return platform_driver_register(&hvc_tile_driver);
+#endif
 }
 device_initcall(hvc_tile_init);
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index cc4c868..47c6e7b 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1439,6 +1439,15 @@ config SERIAL_EFM32_UART_CONSOLE
 	depends on SERIAL_EFM32_UART=y
 	select SERIAL_CORE_CONSOLE
 
+config SERIAL_TILEGX
+	tristate "TILE-Gx on-chip serial port support"
+	depends on TILEGX
+	select TILE_GXIO_UART
+	select SERIAL_CORE
+	---help---
+	  This device provides access to the on-chip UARTs on the TILE-Gx
+	  processor.
+
 config SERIAL_ARC
 	tristate "ARC UART driver support"
 	select SERIAL_CORE
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 47b679c..3068c77 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -66,6 +66,7 @@ obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o
 obj-$(CONFIG_SERIAL_OMAP) += omap-serial.o
 obj-$(CONFIG_SERIAL_ALTERA_UART) += altera_uart.o
 obj-$(CONFIG_SERIAL_ST_ASC) += st-asc.o
+obj-$(CONFIG_SERIAL_TILEGX) += tilegx.o
 obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o
 obj-$(CONFIG_SERIAL_QE) += ucc_uart.o
 obj-$(CONFIG_SERIAL_TIMBERDALE)	+= timbuart.o
diff --git a/drivers/tty/serial/tilegx.c b/drivers/tty/serial/tilegx.c
new file mode 100644
index 0000000..f92d7e6
--- /dev/null
+++ b/drivers/tty/serial/tilegx.c
@@ -0,0 +1,708 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILEGx UART driver.
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/serial_core.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#include <gxio/common.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_uart.h>
+#include <gxio/kiorpc.h>
+
+#include <hv/drv_uart_intf.h>
+
+/*
+ * Use device name ttyS, major 4, minor 64-65.
+ * This is the usual serial port name, 8250 conventional range.
+ */
+#define TILEGX_UART_MAJOR	TTY_MAJOR
+#define TILEGX_UART_MINOR	64
+#define TILEGX_UART_NAME	"ttyS"
+#define DRIVER_NAME_STRING	"TILEGx_Serial"
+#define TILEGX_UART_REF_CLK	125000000 /* REF_CLK is always 125 MHz. */
+
+struct tile_uart_port {
+	/* UART port. */
+	struct uart_port	uart;
+
+	/* GXIO device context. */
+	gxio_uart_context_t	context;
+
+	/* UART access mutex. */
+	struct mutex		mutex;
+
+	/* CPU receiving interrupts. */
+	int			irq_cpu;
+};
+
+static struct tile_uart_port tile_uart_ports[TILEGX_UART_NR];
+static struct uart_driver tilegx_uart_driver;
+
+
+/*
+ * Read UART rx fifo, and insert the chars into tty buffer.
+ */
+static void receive_chars(struct tile_uart_port *tile_uart,
+			  struct tty_struct *tty)
+{
+	gxio_uart_context_t *uart_ctx = &tile_uart->context;
+	struct tty_port *tport = tty->port;
+	UART_FIFO_COUNT_t fifo;
+	int remaining;
+
+	/* Snapshot the fill level once, then pop exactly that many bytes. */
+	fifo.word = gxio_uart_read(uart_ctx, UART_FIFO_COUNT);
+	for (remaining = fifo.rfifo_count; remaining > 0; remaining--) {
+		char byte = (char)gxio_uart_read(uart_ctx, UART_RECEIVE_DATA);
+		tty_insert_flip_char(tport, byte, TTY_NORMAL);
+	}
+}
+
+
+/*
+ * Drain the Rx FIFO, called by interrupt handler.
+ */
+static void handle_receive(struct tile_uart_port *tile_uart)
+{
+	struct tty_port *port = &tile_uart->uart.state->port;
+	struct tty_struct *tty = tty_port_tty_get(port);
+	gxio_uart_context_t *context = &tile_uart->context;
+
+	if (!tty)
+		return;
+
+	/* First read UART rx fifo. */
+	receive_chars(tile_uart, tty);
+
+	/* Reset RFIFO_WE interrupt. */
+	gxio_uart_write(context, UART_INTERRUPT_STATUS,
+			UART_INTERRUPT_MASK__RFIFO_WE_MASK);
+
+	/* Read once more, in case any chars arrived between the first
+	 * read and the interrupt reset.
+	 */
+	receive_chars(tile_uart, tty);
+	/* Drop the port lock (held by tilegx_interrupt) around the push. */
+	spin_unlock(&tile_uart->uart.lock);
+	tty_flip_buffer_push(port);
+	spin_lock(&tile_uart->uart.lock);
+	tty_kref_put(tty);
+}
+
+
+/*
+ * Push one char to UART Write FIFO.
+ * Return 0 on success, -1 if the write FIFO is full.
+ */
+static int tilegx_putchar(gxio_uart_context_t *context, char c)
+{
+	UART_FLAG_t flag;
+	flag.word = gxio_uart_read(context, UART_FLAG);
+	if (flag.wfifo_full)
+		return -1;
+
+	gxio_uart_write(context, UART_TRANSMIT_DATA, (unsigned long)c);
+	return 0;
+}
+
+
+/*
+ * Send chars to UART Write FIFO; called by interrupt handler.
+ */
+static void handle_transmit(struct tile_uart_port *tile_uart)
+{
+	unsigned char ch;
+	struct uart_port *port;
+	struct circ_buf *xmit;
+	gxio_uart_context_t *context = &tile_uart->context;
+
+	/* First reset WFIFO_RE interrupt. */
+	gxio_uart_write(context, UART_INTERRUPT_STATUS,
+			UART_INTERRUPT_MASK__WFIFO_RE_MASK);
+
+	port = &tile_uart->uart;
+	xmit = &port->state->xmit;
+	if (port->x_char) {
+		if (tilegx_putchar(context, port->x_char))
+			return;
+		port->x_char = 0;
+		port->icount.tx++;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+		return;
+
+	while (!uart_circ_empty(xmit)) {
+		ch = xmit->buf[xmit->tail];
+		if (tilegx_putchar(context, ch))
+			break;
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	/* Reset WFIFO_RE interrupt. */
+	gxio_uart_write(context, UART_INTERRUPT_STATUS,
+			UART_INTERRUPT_MASK__WFIFO_RE_MASK);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+}
+
+
+/*
+ * UART Interrupt handler.
+ */
+static irqreturn_t tilegx_interrupt(int irq, void *dev_id)
+{
+	unsigned long flags;
+	UART_INTERRUPT_STATUS_t intr_stat;
+	struct tile_uart_port *tile_uart;
+	gxio_uart_context_t *context;
+	struct uart_port *port = dev_id;
+	irqreturn_t ret = IRQ_NONE;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	context = &tile_uart->context;
+	intr_stat.word = gxio_uart_read(context, UART_INTERRUPT_STATUS);
+
+	if (intr_stat.rfifo_we) {
+		handle_receive(tile_uart);
+		ret = IRQ_HANDLED;
+	}
+	if (intr_stat.wfifo_re) {
+		handle_transmit(tile_uart);
+		ret = IRQ_HANDLED;
+	}
+
+	spin_unlock_irqrestore(&port->lock, flags);
+	return ret;
+}
+
+
+/*
+ * Return TIOCSER_TEMT when transmitter FIFO is empty.
+ */
+static u_int tilegx_tx_empty(struct uart_port *port)
+{
+	struct tile_uart_port *tile_uart =
+		container_of(port, struct tile_uart_port, uart);
+	UART_FLAG_t uart_flag;
+	u_int status = 0;
+
+	/* If the mutex is contended, report "not empty" rather than wait. */
+	if (!mutex_trylock(&tile_uart->mutex))
+		return status;
+
+	uart_flag.word = gxio_uart_read(&tile_uart->context, UART_FLAG);
+	if (uart_flag.wfifo_empty)
+		status = TIOCSER_TEMT;
+	mutex_unlock(&tile_uart->mutex);
+
+	return status;
+}
+
+
+/*
+ * Set state of the modem control output lines.
+ */
+static void tilegx_set_mctrl(struct uart_port *port, u_int mctrl)
+{
+	/* N/A */
+}
+
+
+/*
+ * Get state of the modem control input lines.
+ */
+static u_int tilegx_get_mctrl(struct uart_port *port)
+{
+	return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR;
+}
+
+
+/*
+ * Stop transmitting.
+ */
+static void tilegx_stop_tx(struct uart_port *port)
+{
+	/* N/A */
+}
+
+
+/*
+ * Start transmitting.
+ */
+static void tilegx_start_tx(struct uart_port *port)
+{
+	unsigned char ch;
+	struct circ_buf *xmit;
+	struct tile_uart_port *tile_uart;
+	gxio_uart_context_t *context;
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	if (!mutex_trylock(&tile_uart->mutex))
+		return;
+	context = &tile_uart->context;
+	xmit = &port->state->xmit;
+	/* Send port->x_char first, if one is pending. */
+	if (port->x_char) {
+		if (tilegx_putchar(context, port->x_char))
+			goto out;	/* FIFO full: still drop the mutex */
+		port->x_char = 0;
+		port->icount.tx++;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+		goto out;
+
+	/* Copy from the circular buffer until it drains or the FIFO fills. */
+	while (!uart_circ_empty(xmit)) {
+		ch = xmit->buf[xmit->tail];
+		if (tilegx_putchar(context, ch))
+			break;
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+out:
+	mutex_unlock(&tile_uart->mutex);
+}
+
+
+/*
+ * Stop receiving - port is in process of being closed.
+ */
+static void tilegx_stop_rx(struct uart_port *port)
+{
+	int err;
+	struct tile_uart_port *tile_uart;
+	gxio_uart_context_t *context;
+	int cpu;
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	if (!mutex_trylock(&tile_uart->mutex))
+		return;
+
+	context = &tile_uart->context;
+	cpu = tile_uart->irq_cpu;
+	err = gxio_uart_cfg_interrupt(context, cpu_x(cpu), cpu_y(cpu),
+				      KERNEL_PL, -1);
+	mutex_unlock(&tile_uart->mutex);
+}
+
+
+/*
+ * Enable modem status interrupts.
+ */
+static void tilegx_enable_ms(struct uart_port *port)
+{
+	/* N/A */
+}
+
+/*
+ * Control the transmission of a break signal.
+ */
+static void tilegx_break_ctl(struct uart_port *port, int break_state)
+{
+	/* N/A */
+}
+
+
+/*
+ * Perform initialization and enable port for reception.
+ */
+static int tilegx_startup(struct uart_port *port)
+{
+	struct tile_uart_port *tile_uart;
+	gxio_uart_context_t *context;
+	int irq, ret = 0;
+	int cpu = raw_smp_processor_id();  /* pick an arbitrary cpu */
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	if (mutex_lock_interruptible(&tile_uart->mutex))
+		return -EBUSY;
+	context = &tile_uart->context;
+
+	/* Now open the hypervisor device if we haven't already. */
+	if (context->fd < 0) {
+		UART_INTERRUPT_MASK_t intr_mask;
+
+		/* Initialize UART device. */
+		if (gxio_uart_init(context, port->line)) {
+			ret = -ENXIO;
+			goto err;
+		}
+
+		/* Create our IRQ; test a signed copy, port->irq is unsigned. */
+		irq = create_irq();
+		if (irq < 0)
+			goto err_uart_dest;
+		port->irq = irq;
+		tile_irq_activate(port->irq, TILE_IRQ_PERCPU);
+
+		/* Register our IRQs. */
+		ret = request_irq(port->irq, tilegx_interrupt, 0,
+				  tilegx_uart_driver.driver_name, port);
+		if (ret)
+			goto err_dest_irq;
+
+		/* Request that the hardware start sending us interrupts. */
+		tile_uart->irq_cpu = cpu;
+		ret = gxio_uart_cfg_interrupt(context, cpu_x(cpu), cpu_y(cpu),
+					      KERNEL_PL, port->irq);
+		if (ret)
+			goto err_free_irq;
+
+		/* Enable UART Tx/Rx Interrupt. */
+		intr_mask.word = gxio_uart_read(context, UART_INTERRUPT_MASK);
+		intr_mask.wfifo_re = 0;
+		intr_mask.rfifo_we = 0;
+		gxio_uart_write(context, UART_INTERRUPT_MASK, intr_mask.word);
+
+		/* Reset the Tx/Rx interrupt in case it's set. */
+		gxio_uart_write(context, UART_INTERRUPT_STATUS,
+				UART_INTERRUPT_MASK__WFIFO_RE_MASK |
+				UART_INTERRUPT_MASK__RFIFO_WE_MASK);
+	}
+
+	mutex_unlock(&tile_uart->mutex);
+	return ret;
+
+err_free_irq:
+	free_irq(port->irq, port);
+err_dest_irq:
+	destroy_irq(port->irq);
+err_uart_dest:
+	gxio_uart_destroy(context);
+	ret = -ENXIO;
+err:
+	mutex_unlock(&tile_uart->mutex);
+	return ret;
+}
+
+
+/*
+ * Release kernel resources if it is the last close, disable the port,
+ * free IRQ and close the port.
+ */
+static void tilegx_shutdown(struct uart_port *port)
+{
+	int err;
+	UART_INTERRUPT_MASK_t intr_mask;
+	struct tile_uart_port *tile_uart;
+	gxio_uart_context_t *context;
+	int cpu;
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	/* Must not bail out on a signal: that would leak the IRQ and device. */
+	mutex_lock(&tile_uart->mutex);
+	context = &tile_uart->context;
+
+	/* Disable UART Tx/Rx Interrupt. */
+	intr_mask.word = gxio_uart_read(context, UART_INTERRUPT_MASK);
+	intr_mask.wfifo_re = 1;
+	intr_mask.rfifo_we = 1;
+	gxio_uart_write(context, UART_INTERRUPT_MASK, intr_mask.word);
+
+	/* Request that the hardware stop sending us interrupts. */
+	cpu = tile_uart->irq_cpu;
+	err = gxio_uart_cfg_interrupt(context, cpu_x(cpu), cpu_y(cpu),
+				      KERNEL_PL, -1);
+
+	if (port->irq > 0) {
+		free_irq(port->irq, port);
+		destroy_irq(port->irq);
+		port->irq = 0;
+	}
+
+	gxio_uart_destroy(context);
+
+	mutex_unlock(&tile_uart->mutex);
+}
+
+
+/*
+ * Flush the buffer.
+ */
+static void tilegx_flush_buffer(struct uart_port *port)
+{
+	/* N/A */
+}
+
+
+/*
+ * Change the port parameters.
+ */
+static void tilegx_set_termios(struct uart_port *port,
+			       struct ktermios *termios, struct ktermios *old)
+{
+	int err;
+	UART_DIVISOR_t divisor;
+	UART_TYPE_t type;
+	unsigned int baud;
+	struct tile_uart_port *tile_uart;
+	gxio_uart_context_t *context;
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	if (!mutex_trylock(&tile_uart->mutex))
+		return;
+	context = &tile_uart->context;
+
+	/* Open the hypervisor device if we haven't already. */
+	if (context->fd < 0) {
+		err = gxio_uart_init(context, port->line);
+		if (err) {
+			mutex_unlock(&tile_uart->mutex);
+			return;
+		}
+	}
+
+	divisor.word = gxio_uart_read(context, UART_DIVISOR);
+	type.word = gxio_uart_read(context, UART_TYPE);
+
+	/* Divisor. */
+	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16);
+	divisor.divisor = uart_get_divisor(port, baud);
+
+	/* Byte size. */
+	if ((termios->c_cflag & CSIZE) == CS7)
+		type.dbits = UART_TYPE__DBITS_VAL_SEVEN_DBITS;
+	else
+		type.dbits = UART_TYPE__DBITS_VAL_EIGHT_DBITS;
+
+	/* Parity. */
+	if (termios->c_cflag & PARENB) {
+		/* Mark or Space parity. */
+		if (termios->c_cflag & CMSPAR)
+			if (termios->c_cflag & PARODD)
+				type.ptype = UART_TYPE__PTYPE_VAL_MARK;
+			else
+				type.ptype = UART_TYPE__PTYPE_VAL_SPACE;
+		else if (termios->c_cflag & PARODD)
+			type.ptype = UART_TYPE__PTYPE_VAL_ODD;
+		else
+			type.ptype = UART_TYPE__PTYPE_VAL_EVEN;
+	} else
+		type.ptype = UART_TYPE__PTYPE_VAL_NONE;
+
+	/* Stop bits. */
+	if (termios->c_cflag & CSTOPB)
+		type.sbits = UART_TYPE__SBITS_VAL_TWO_SBITS;
+	else
+		type.sbits = UART_TYPE__SBITS_VAL_ONE_SBITS;
+
+	/* Set the UART parameters. */
+	gxio_uart_write(context, UART_DIVISOR, divisor.word);
+	gxio_uart_write(context, UART_TYPE, type.word);
+
+	mutex_unlock(&tile_uart->mutex);
+}
+
+
+/*
+ * Return string describing the specified port.
+ */
+static const char *tilegx_type(struct uart_port *port)
+{
+	return port->type == PORT_TILEGX ? DRIVER_NAME_STRING : NULL;
+}
+
+
+/*
+ * Release the resources being used by 'port'.
+ */
+static void tilegx_release_port(struct uart_port *port)
+{
+	/* Nothing to release. */
+}
+
+
+/*
+ * Request the resources being used by 'port'.
+ */
+static int tilegx_request_port(struct uart_port *port)
+{
+	/* Always present. */
+	return 0;
+}
+
+
+/*
+ * Configure/autoconfigure the port.
+ */
+static void tilegx_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE)
+		port->type = PORT_TILEGX;
+}
+
+
+/*
+ * Verify the new serial_struct (for TIOCSSERIAL).
+ */
+static int tilegx_verify_port(struct uart_port *port,
+			      struct serial_struct *ser)
+{
+	if ((ser->type != PORT_UNKNOWN) && (ser->type != PORT_TILEGX))
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+
+/*
+ * Console polling routines for writing and reading from the uart while
+ * in an interrupt or debug context.
+ */
+
+static int tilegx_poll_get_char(struct uart_port *port)
+{
+	UART_FIFO_COUNT_t count;
+	gxio_uart_context_t *context;
+	struct tile_uart_port *tile_uart;
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	context = &tile_uart->context;
+	count.word = gxio_uart_read(context, UART_FIFO_COUNT);
+	if (count.rfifo_count == 0)
+		return NO_POLL_CHAR;
+	return (char)gxio_uart_read(context, UART_RECEIVE_DATA);
+}
+
+static void tilegx_poll_put_char(struct uart_port *port, unsigned char c)
+{
+	gxio_uart_context_t *context;
+	struct tile_uart_port *tile_uart;
+
+	tile_uart = container_of(port, struct tile_uart_port, uart);
+	context = &tile_uart->context;
+	gxio_uart_write(context, UART_TRANSMIT_DATA, (unsigned long)c);
+}
+
+#endif /* CONFIG_CONSOLE_POLL */
+
+
+static const struct uart_ops tilegx_ops = {
+	.tx_empty	= tilegx_tx_empty,
+	.set_mctrl	= tilegx_set_mctrl,
+	.get_mctrl	= tilegx_get_mctrl,
+	.stop_tx	= tilegx_stop_tx,
+	.start_tx	= tilegx_start_tx,
+	.stop_rx	= tilegx_stop_rx,
+	.enable_ms	= tilegx_enable_ms,
+	.break_ctl	= tilegx_break_ctl,
+	.startup	= tilegx_startup,
+	.shutdown	= tilegx_shutdown,
+	.flush_buffer	= tilegx_flush_buffer,
+	.set_termios	= tilegx_set_termios,
+	.type		= tilegx_type,
+	.release_port	= tilegx_release_port,
+	.request_port	= tilegx_request_port,
+	.config_port	= tilegx_config_port,
+	.verify_port	= tilegx_verify_port,
+#ifdef CONFIG_CONSOLE_POLL
+	.poll_get_char	= tilegx_poll_get_char,
+	.poll_put_char	= tilegx_poll_put_char,
+#endif
+};
+
+
+static void tilegx_init_ports(void)
+{
+	int i;
+	struct uart_port *port;
+
+	for (i = 0; i < TILEGX_UART_NR; i++) {
+		port = &tile_uart_ports[i].uart;
+		port->ops = &tilegx_ops;
+		port->line = i;
+		port->type = PORT_TILEGX;
+		port->uartclk = TILEGX_UART_REF_CLK;
+		port->flags = UPF_BOOT_AUTOCONF;
+
+		tile_uart_ports[i].context.fd = -1;
+		mutex_init(&tile_uart_ports[i].mutex);
+	}
+}
+
+
+static struct uart_driver tilegx_uart_driver = {
+	.owner		= THIS_MODULE,
+	.driver_name	= DRIVER_NAME_STRING,
+	.dev_name	= TILEGX_UART_NAME,
+	.major		= TILEGX_UART_MAJOR,
+	.minor		= TILEGX_UART_MINOR,
+	.nr		= TILEGX_UART_NR,
+};
+
+
+static int __init tilegx_init(void)
+{
+	int i;
+	int ret;
+	struct tty_driver *tty_drv;
+
+	ret = uart_register_driver(&tilegx_uart_driver);
+	if (ret)
+		return ret;
+	tty_drv = tilegx_uart_driver.tty_driver;
+	tty_drv->init_termios.c_cflag = B115200 | CS8 | CREAD | HUPCL | CLOCAL;
+	tty_drv->init_termios.c_ispeed = 115200;
+	tty_drv->init_termios.c_ospeed = 115200;
+
+	tilegx_init_ports();
+
+	for (i = 0; i < TILEGX_UART_NR; i++) {
+		struct uart_port *port = &tile_uart_ports[i].uart;
+		ret = uart_add_one_port(&tilegx_uart_driver, port);
+	}
+
+	return 0;
+}
+
+
+static void __exit tilegx_exit(void)
+{
+	int idx;
+
+	/* Detach every port before the driver itself goes away. */
+	for (idx = 0; idx < TILEGX_UART_NR; idx++)
+		uart_remove_one_port(&tilegx_uart_driver,
+				     &tile_uart_ports[idx].uart);
+
+	/* Mirror of tilegx_init(): the driver was registered first. */
+	uart_unregister_driver(&tilegx_uart_driver);
+}
+
+
+module_init(tilegx_init);
+module_exit(tilegx_exit);
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_DESCRIPTION("TILEGx serial port driver");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index e40ebe1..b47dba2 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -235,4 +235,7 @@
 /* ST ASC type numbers */
 #define PORT_ASC       105
 
+/* Tilera TILE-Gx UART */
+#define PORT_TILEGX	106
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
index ebf5e0c..366db1a 100644
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -37,6 +37,11 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs)
 			" status = 0x%lx\n",
 		p->addr, regs->cp0_epc, regs->cp0_status);
 #endif
+#ifdef CONFIG_TILEGX
+	printk(KERN_INFO "pre_handler: p->addr = 0x%p, pc = 0x%lx,"
+			" ex1 = 0x%lx\n",
+		p->addr, regs->pc, regs->ex1);
+#endif
 
 	/* A dump_stack() here will give a stack backtrace */
 	return 0;
@@ -58,6 +63,10 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs,
 	printk(KERN_INFO "post_handler: p->addr = 0x%p, status = 0x%lx\n",
 		p->addr, regs->cp0_status);
 #endif
+#ifdef CONFIG_TILEGX
+	printk(KERN_INFO "post_handler: p->addr = 0x%p, ex1 = 0x%lx\n",
+		p->addr, regs->ex1);
+#endif
 }
 
 /*
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 858966a..a674fd5 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -364,6 +364,10 @@ if ($arch eq "x86_64") {
 } elsif ($arch eq "blackfin") {
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s__mcount\$";
     $mcount_adjust = -4;
+} elsif ($arch eq "tilegx") {
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s__mcount\$";
+    $type = ".quad";
+    $alignment = 8;
 } else {
     die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
 }